author    Georgios Pinitas <georgios.pinitas@arm.com>  2021-06-25 12:13:49 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>  2021-06-29 16:26:41 +0000
commit    4a578b923ed000c67fe0bc1433f945aea634ca9c (patch)
tree      b7bb041d2e7bfb4b909199f1b889585d237c665d /arm_compute/runtime/CL/functions
parent    53832b2bcce44c71fe31a618a81765294df55750 (diff)
Port the ClGemmLowp kernels to the new API
Ported kernels:
 - CLGEMMLowpMatrixMultiplyNativeKernel
 - CLGEMMLowpMatrixMultiplyReshapedKernel
 - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
 - CLGEMMLowpOffsetContributionKernel
 - CLGEMMLowpOffsetContributionOutputStageKernel
 - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
 - CLGEMMLowpQuantizeDownInt32ScaleKernel

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I9d5a744d6a2dd2f2726fdfb291bad000b6970de2
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5870
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute/runtime/CL/functions')
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h |  31
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h        | 260
-rw-r--r--  arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h         |  68
-rw-r--r--  arm_compute/runtime/CL/functions/CLQLSTMLayer.h                 | 140
4 files changed, 148 insertions, 351 deletions
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 3d2dbdb104..e62db8e644 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -34,17 +34,17 @@ class CLCompileContext;
class IMemoryManager;
class ICLTensor;
class ITensorInfo;
-class CLGEMMLowpMatrixMultiplyNativeKernel;
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel;
-class CLGEMMLowpOffsetContributionKernel;
-class CLGEMMLowpOffsetContributionOutputStageKernel;
-class CLGEMMLowpMatrixAReductionKernel;
-class CLGEMMLowpMatrixBReductionKernel;
namespace opencl
{
namespace kernels
{
class ClGemmReshapeRhsMatrixKernel;
+class ClGemmLowpMatrixMultiplyNativeKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel;
+class ClGemmLowpOffsetContributionKernel;
+class ClGemmLowpOffsetContributionOutputStageKernel;
+class ClGemmLowpMatrixAReductionKernel;
+class ClGemmLowpMatrixBReductionKernel;
} // namespace kernels
} // namespace opencl
@@ -150,14 +150,14 @@ private:
MemoryGroup _memory_group;
// Kernels used
- std::unique_ptr<opencl::kernels::ClCastKernel> _weights_to_qasymm8;
- std::unique_ptr<CLGEMMLowpMatrixMultiplyNativeKernel> _mm_native_kernel;
- std::unique_ptr<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel> _mm_reshaped_only_rhs_kernel;
- std::unique_ptr<opencl::kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
- std::unique_ptr<CLGEMMLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
- std::unique_ptr<CLGEMMLowpOffsetContributionKernel> _offset_contribution_kernel;
- std::unique_ptr<CLGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+ std::unique_ptr<opencl::kernels::ClCastKernel> _weights_to_qasymm8;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
+ std::unique_ptr<opencl::kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
+ std::unique_ptr<opencl::kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
+ std::unique_ptr<opencl::kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
// Temporary tensors
CLTensor _qasymm8_weights;
@@ -171,7 +171,8 @@ private:
// Tensor pointers
const ICLTensor *_matrix_a;
const ICLTensor *_original_b;
- const ICLTensor *_output;
+ const ICLTensor *_c;
+ ICLTensor *_output;
int32_t _a_offset;
int32_t _b_offset;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index a60992a0f4..e85f2db8a9 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -45,247 +45,28 @@ class ICLTensor;
class ITensorInfo;
struct GEMMLowpOutputStageInfo;
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function also accepts two optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
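The same fixed-point requantization formula documented above backs all three quantize-down functions removed by this patch. For reference, a minimal scalar sketch in C++ of that formula (following gemmlowp's saturating rounding doubling high multiply); the helper names are hypothetical and this models the arithmetic only, not the OpenCL kernel itself:

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>

// FixedPointMul(x, y): nearest integer to (x * y) / 2^31, evaluated in 64-bit
// arithmetic so no intermediate overflow occurs. Hypothetical helper name.
inline int32_t fixed_point_mul(int32_t x, int32_t y)
{
    // The single saturating case: both operands equal to INT32_MIN.
    if(x == std::numeric_limits<int32_t>::min() && y == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t prod  = static_cast<int64_t>(x) * static_cast<int64_t>(y);
    const int64_t nudge = (prod >= 0) ? (1LL << 30) : (1 - (1LL << 30));
    return static_cast<int32_t>((prod + nudge) / (1LL << 31));
}

// Quantize one S32 accumulator down to QASYMM8 as described in the doc comment.
// A shared bias, when present, is added to acc before the multiplication.
inline uint8_t quantize_down_u8(int32_t acc, int32_t result_fixedpoint_multiplier,
                                int32_t result_shift, int32_t result_offset_after_shift,
                                int32_t min = std::numeric_limits<int32_t>::lowest(),
                                int32_t max = std::numeric_limits<int32_t>::max())
{
    int32_t v = fixed_point_mul(acc, result_fixedpoint_multiplier) >> result_shift;
    v += result_offset_after_shift;
    v = std::min(std::max(v, min), max);               // optional ReLU-style bounds
    v = std::min(std::max(v, int32_t{0}), int32_t{255}); // saturate to QASYMM8 range
    return static_cast<uint8_t>(v);
}
```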
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function also accepts two optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- * result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following CL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- *
- * @note The function also accepts two optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
- *
- * @param[in] input Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
/** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
*
* This function calls the following CL kernels:
*
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
*/
-class CLGEMMLowpOutputStage : public ICLSimpleFunction
+class CLGEMMLowpOutputStage : public IFunction
{
public:
+ CLGEMMLowpOutputStage();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpOutputStage(const CLGEMMLowpOutputStage &) = delete;
+ /** Default move constructor */
+ CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpOutputStage &operator=(const CLGEMMLowpOutputStage &) = delete;
+ /** Default move assignment operator */
+ CLGEMMLowpOutputStage &operator=(CLGEMMLowpOutputStage &&);
+ /** Default destructor */
+ ~CLGEMMLowpOutputStage();
/** Initialise the kernel's inputs, output
*
* Valid data layouts:
@@ -315,7 +96,7 @@ public:
* @param[in] info GEMMLowp output stage metadata.
*/
void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
*
* @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
* @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
@@ -326,6 +107,15 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<ICLKernel> _kernel;
+ const ICLTensor *_input;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */
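With the three typed quantize-down functions folded into the single CLGEMMLowpOutputStage, call sites now select the kernel through GEMMLowpOutputStageInfo instead. A minimal migration sketch, assuming the GEMMLowpOutputStageInfo fields available in this library version and tensors configured and allocated elsewhere (configure_output_stage is a hypothetical helper):

```cpp
#include <limits>

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

// Replaces a former CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure() call.
void configure_output_stage(CLGEMMLowpOutputStage &stage,
                            const CLTensor *src,  // S32 accumulators from CLGEMMLowpMatrixMultiplyCore
                            const CLTensor *bias, // optional 1D [OFM] bias, may be nullptr
                            CLTensor       *dst,  // QASYMM8 result
                            int32_t multiplier, int32_t shift, int32_t offset)
{
    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.gemmlowp_multiplier = multiplier;
    info.gemmlowp_shift      = shift;
    info.gemmlowp_offset     = offset;
    info.gemmlowp_min_bound  = std::numeric_limits<int32_t>::lowest();
    info.gemmlowp_max_bound  = std::numeric_limits<int32_t>::max();
    info.output_data_type    = DataType::QASYMM8; // QASYMM8_SIGNED / QSYMM16 cover the other removed variants

    stage.configure(src, bias, dst, info);
}
```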
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
index 2ef7427a5a..9c004b85d0 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -47,16 +47,16 @@ class ICLTensor;
*
* This function calls the following CL functions/kernels:
*
- * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref CLTranspose Matrix transpose
- * -# @ref CLConcatenateLayer Tensor concatenation
- * -# @ref CLActivationLayer Activation functions (tanh and logistic)
- * -# @ref CLArithmeticAddition Elementwise addition
- * -# @ref CLPixelWiseMultiplication Elementwise multiplication
- * -# @ref CLSlice Tensor slicing
- * -# @ref CLDequantizationLayer Dequantize into float
- * -# @ref CLQuantizationLayer Quantize from float
+ * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
+ * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref CLTranspose Matrix transpose
+ * -# @ref CLConcatenateLayer Tensor concatenation
+ * -# @ref CLActivationLayer Activation functions (tanh and logistic)
+ * -# @ref CLArithmeticAddition Elementwise addition
+ * -# @ref CLPixelWiseMultiplication Elementwise multiplication
+ * -# @ref CLSlice Tensor slicing
+ * -# @ref CLDequantizationLayer Dequantize into float
+ * -# @ref CLQuantizationLayer Quantize from float
* */
class CLLSTMLayerQuantized : public IFunction
{
@@ -170,30 +170,30 @@ private:
MemoryGroup _memory_group;
// Functions used
- CLGEMMLowpMatrixMultiplyCore _gemmlowp;
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
- CLTranspose _transpose_weights;
- CLConcatenateLayer _concat_input_weights;
- CLConcatenateLayer _concat_recurrent_weights;
- CLConcatenateLayer _concat_weights;
- CLConcatenateLayer _concat_inputs;
- CLConcatenateLayer _concat_bias;
- CLActivationLayer _sigmoid_forget_gate;
- CLActivationLayer _sigmoid_input_gate;
- CLActivationLayer _sigmoid_output_gate;
- CLActivationLayer _tanh_modulation_gate;
- CLActivationLayer _tanh_output_state;
- CLArithmeticAddition _add_cell_state_tmps;
- CLArithmeticAddition _add2;
- CLPixelWiseMultiplication _mul_forget_gate_cell_state;
- CLPixelWiseMultiplication _mul_input_gate_input_mod_gate;
- CLPixelWiseMultiplication _mul_output_state_tmp_output_gate;
- CLSlice _slice_input_tensor;
- CLSlice _slice_forget_tensor;
- CLSlice _slice_cell_tensor;
- CLSlice _slice_output_tensor;
- CLDequantizationLayer _dequantize;
- CLQuantizationLayer _quantize;
+ CLGEMMLowpMatrixMultiplyCore _gemmlowp;
+ CLGEMMLowpOutputStage _output_stage;
+ CLTranspose _transpose_weights;
+ CLConcatenateLayer _concat_input_weights;
+ CLConcatenateLayer _concat_recurrent_weights;
+ CLConcatenateLayer _concat_weights;
+ CLConcatenateLayer _concat_inputs;
+ CLConcatenateLayer _concat_bias;
+ CLActivationLayer _sigmoid_forget_gate;
+ CLActivationLayer _sigmoid_input_gate;
+ CLActivationLayer _sigmoid_output_gate;
+ CLActivationLayer _tanh_modulation_gate;
+ CLActivationLayer _tanh_output_state;
+ CLArithmeticAddition _add_cell_state_tmps;
+ CLArithmeticAddition _add2;
+ CLPixelWiseMultiplication _mul_forget_gate_cell_state;
+ CLPixelWiseMultiplication _mul_input_gate_input_mod_gate;
+ CLPixelWiseMultiplication _mul_output_state_tmp_output_gate;
+ CLSlice _slice_input_tensor;
+ CLSlice _slice_forget_tensor;
+ CLSlice _slice_cell_tensor;
+ CLSlice _slice_output_tensor;
+ CLDequantizationLayer _dequantize;
+ CLQuantizationLayer _quantize;
// Tensor pointers
const ICLTensor *_input_to_input_weights;
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index bd00d56468..1b0b759d74 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -40,9 +40,15 @@ namespace arm_compute
// Forward declarations
class CLCompileContext;
class ICLTensor;
-class CLGEMMLowpMatrixAReductionKernel;
class CLQLSTMLayerNormalizationKernel;
class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace opencl
/** Basic function to run @ref CLQLSTMLayer
*
@@ -52,8 +58,8 @@ class ITensorInfo;
* -# @ref CLCopy Copy function for copying output_state_out to output
* -# @ref CLArithmeticAddition Elementwise addition and subtraction
* -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref CLGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
+ * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref opencl::kernels::ClGemmLowpMatrixAReductionKernel For precomputing effective biases to use
* -# @ref CLPixelWiseMultiplication Elementwise multiplication
* -# @ref CLTranspose Transpose function for reshaping the weights
* */
@@ -297,70 +303,70 @@ private:
};
// Functions used
- CLTranspose _transpose_input_to_forget_weights{};
- CLTranspose _transpose_input_to_cell_weights{};
- CLTranspose _transpose_input_to_output_weights{};
- CLTranspose _transpose_input_to_input_weights{};
- CLTranspose _transpose_recurrent_to_forget_weights{};
- CLTranspose _transpose_recurrent_to_cell_weights{};
- CLTranspose _transpose_recurrent_to_output_weights{};
- CLTranspose _transpose_recurrent_to_input_weights{};
- CLTranspose _transpose_projection_weights{};
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
- std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _projection_reduction;
- CLArithmeticAddition _projection_bias_add{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
- CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
- CLGEMMLowpOutputStage _input_to_forget_outstage{};
- CLGEMMLowpOutputStage _recurrent_to_forget_outstage{};
- CLGEMMLowpOutputStage _cell_to_forget_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_forget{};
- CLArithmeticAddition _accumulate_cell_forget{};
- CLActivationLayer _forget_gate_sigmoid{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
- CLGEMMLowpOutputStage _input_to_cell_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
- CLGEMMLowpOutputStage _recurrent_to_cell_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_modulation{};
- CLActivationLayer _cell_gate_tanh{};
- CLArithmeticSubtraction _input_gate_sub{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
- CLGEMMLowpOutputStage _input_to_input_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
- CLGEMMLowpOutputStage _recurrent_to_input_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_input{};
- CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
- CLGEMMLowpOutputStage _cell_to_input_outstage{};
- CLArithmeticAddition _accumulate_cell_input{};
- CLActivationLayer _input_gate_sigmoid{};
- CLPixelWiseMultiplication _pixelwise_mul_forget_cell{};
- CLPixelWiseMultiplication _pixelwise_mul_input_cell{};
- CLArithmeticAddition _add_forget_cell{};
- CLActivationLayer _cell_clip{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
- CLGEMMLowpOutputStage _input_to_output_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
- CLGEMMLowpOutputStage _recurrent_to_output_outstage{};
- CLArithmeticAddition _accumulate_input_recurrent_output{};
- CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
- CLGEMMLowpOutputStage _cell_to_output_outstage{};
- CLArithmeticAddition _accumulate_cell_to_output{};
- CLActivationLayer _output_gate_sigmoid{};
- CLActivationLayer _hidden_tanh{};
- CLPixelWiseMultiplication _pixelwise_mul_hidden{};
- CLGEMMLowpOutputStage _hidden_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_projection{};
- CLGEMMLowpOutputStage _projection_outstage{};
- CLArithmeticAddition _accumulate_projection{};
- CLActivationLayer _projection_clip{};
+ CLTranspose _transpose_input_to_forget_weights{};
+ CLTranspose _transpose_input_to_cell_weights{};
+ CLTranspose _transpose_input_to_output_weights{};
+ CLTranspose _transpose_input_to_input_weights{};
+ CLTranspose _transpose_recurrent_to_forget_weights{};
+ CLTranspose _transpose_recurrent_to_cell_weights{};
+ CLTranspose _transpose_recurrent_to_output_weights{};
+ CLTranspose _transpose_recurrent_to_input_weights{};
+ CLTranspose _transpose_projection_weights{};
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _projection_reduction;
+ CLArithmeticAddition _projection_bias_add{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
+ CLGEMMLowpOutputStage _input_to_forget_outstage{};
+ CLGEMMLowpOutputStage _recurrent_to_forget_outstage{};
+ CLGEMMLowpOutputStage _cell_to_forget_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_forget{};
+ CLArithmeticAddition _accumulate_cell_forget{};
+ CLActivationLayer _forget_gate_sigmoid{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
+ CLGEMMLowpOutputStage _input_to_cell_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
+ CLGEMMLowpOutputStage _recurrent_to_cell_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_modulation{};
+ CLActivationLayer _cell_gate_tanh{};
+ CLArithmeticSubtraction _input_gate_sub{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
+ CLGEMMLowpOutputStage _input_to_input_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
+ CLGEMMLowpOutputStage _recurrent_to_input_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_input{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
+ CLGEMMLowpOutputStage _cell_to_input_outstage{};
+ CLArithmeticAddition _accumulate_cell_input{};
+ CLActivationLayer _input_gate_sigmoid{};
+ CLPixelWiseMultiplication _pixelwise_mul_forget_cell{};
+ CLPixelWiseMultiplication _pixelwise_mul_input_cell{};
+ CLArithmeticAddition _add_forget_cell{};
+ CLActivationLayer _cell_clip{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
+ CLGEMMLowpOutputStage _input_to_output_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
+ CLGEMMLowpOutputStage _recurrent_to_output_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_output{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
+ CLGEMMLowpOutputStage _cell_to_output_outstage{};
+ CLArithmeticAddition _accumulate_cell_to_output{};
+ CLActivationLayer _output_gate_sigmoid{};
+ CLActivationLayer _hidden_tanh{};
+ CLPixelWiseMultiplication _pixelwise_mul_hidden{};
+ CLGEMMLowpOutputStage _hidden_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_projection{};
+ CLGEMMLowpOutputStage _projection_outstage{};
+ CLArithmeticAddition _accumulate_projection{};
+ CLActivationLayer _projection_clip{};
std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
CLCopy _copy_output;