author    Manuel Bottini <manuel.bottini@arm.com>    2021-06-18 15:47:28 +0100
committer Manuel Bottini <manuel.bottini@arm.com>    2021-07-08 14:47:38 +0000
commit    cfac51c779f9bf05e8b2d386fbfb4022767d1d30 (patch)
tree      6ded148068c32bb1b2926946f59d0262d928b9ab
parent    06ac6e438fc95aa7f8228be8217e0776d692b8e7 (diff)
download  ComputeLibrary-cfac51c779f9bf05e8b2d386fbfb4022767d1d30.tar.gz
Port NEGEMMLowp Part 2
Details:
Extend NEConvertQuantizedSignednessKernel
Port NEGEMMInterleave4x4Kernel to CpuGemmInterleave4x4Kernel
Port NEGEMMTranspose1xWKernel to CpuGemmTranspose1xWKernel
Port NEGEMMLowpMatrixAReductionKernel to CpuGemmLowpMatrixAReductionKernel
Port NEGEMMLowpMatrixBReductionKernel to CpuGemmLowpMatrixBReductionKernel
Port NEGEMMLowpOffsetContributionOutputStageKernel to CpuGemmLowpOffsetContributionOutputStageKernel
Port NEGEMMLowpOffsetContributionKernel to CpuGemmLowpOffsetContributionKernel

Resolves: COMPMID-4403

Change-Id: I3227f052f25e7b41d073bbea1da8a881fcd78b8e
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5875
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
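All of the ports listed above follow the same pattern: the legacy NEON kernels bound ITensor pointers at configure() time and ran on that stored state, while the new cpu::kernels classes are stateless, are configured from ITensorInfo only, and receive the actual tensors through an ITensorPack when they are run. The schematic below contrasts the two interfaces; the class names are placeholders for illustration, not classes from this patch.

// Schematic sketch only; "Example" is a placeholder name.
#include "src/core/NEON/INEKernel.h"  // legacy base class
#include "src/core/cpu/ICpuKernel.h"  // new base class

namespace arm_compute
{
// Before: the kernel binds tensors at configure() time and holds them as members.
class NEExampleKernel : public INEKernel
{
public:
    const char *name() const override;
    void configure(const ITensor *input, ITensor *output); // stores _input/_output
    void run(const Window &window, const ThreadInfo &info) override;
private:
    const ITensor *_input{ nullptr };
    ITensor       *_output{ nullptr };
};

namespace cpu
{
namespace kernels
{
// After: the kernel is configured from tensor info only and gets the actual
// tensors through an ITensorPack each time it is run.
class CpuExampleKernel : public ICpuKernel
{
public:
    const char *name() const override;
    void configure(const ITensorInfo *src, ITensorInfo *dst);
    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute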
-rw-r--r--  Android.bp | 11
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 20
-rw-r--r--  arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 139
-rw-r--r--  docs/user_guide/release_version_and_change_log.dox | 14
-rw-r--r--  filelist.json | 15
-rw-r--r--  src/core/NEON/NEKernels.h | 5
-rw-r--r--  src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h | 78
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 92
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h | 135
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpReductionKernel.h | 196
-rw-r--r--  src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp (renamed from src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp) | 70
-rw-r--r--  src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h | 63
-rw-r--r--  src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h | 1
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp (renamed from src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp) | 109
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h | 80
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp (renamed from src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp) | 184
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h | 157
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp (renamed from src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp) | 66
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h (renamed from src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h) | 71
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp (renamed from src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp) | 115
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h | 114
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h | 2
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h | 2
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 2
-rw-r--r--  src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h | 2
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp | 664
-rw-r--r--  src/runtime/NEON/functions/NEQLSTMLayer.cpp | 145
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp | 717
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h | 174
-rw-r--r--  src/runtime/gpu/cl/utils/ClAuxTensorHandler.h | 4
-rw-r--r--  tests/validation/NEON/GEMMLowp.cpp | 101
31 files changed, 1974 insertions, 1574 deletions
diff --git a/Android.bp b/Android.bp
index fb8cb529e2..90facab3ca 100644
--- a/Android.bp
+++ b/Android.bp
@@ -146,7 +146,6 @@ cc_library_static {
"src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp",
"src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp",
"src/core/NEON/kernels/NECol2ImKernel.cpp",
- "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp",
"src/core/NEON/kernels/NECropKernel.cpp",
"src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp",
"src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp",
@@ -154,10 +153,6 @@ cc_library_static {
"src/core/NEON/kernels/NEFFTScaleKernel.cpp",
"src/core/NEON/kernels/NEFillBorderKernel.cpp",
"src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp",
- "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp",
- "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp",
- "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp",
- "src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp",
"src/core/NEON/kernels/NEGatherKernel.cpp",
"src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp",
"src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp",
@@ -258,6 +253,7 @@ cc_library_static {
"src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp",
"src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp",
"src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp",
+ "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp",
"src/core/cpu/kernels/CpuCopyKernel.cpp",
"src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp",
"src/core/cpu/kernels/CpuDequantizeKernel.cpp",
@@ -268,6 +264,10 @@ cc_library_static {
"src/core/cpu/kernels/CpuFillKernel.cpp",
"src/core/cpu/kernels/CpuFloorKernel.cpp",
"src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp",
+ "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp",
+ "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp",
+ "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp",
+ "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp",
"src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp",
"src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp",
"src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
@@ -645,6 +645,7 @@ cc_library_static {
"src/runtime/cpu/operators/CpuFloor.cpp",
"src/runtime/cpu/operators/CpuGemm.cpp",
"src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp",
+ "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp",
"src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp",
"src/runtime/cpu/operators/CpuMul.cpp",
"src/runtime/cpu/operators/CpuPermute.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 60cfd8f91d..896ef60d6f 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -24,6 +24,7 @@
#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
@@ -33,19 +34,14 @@
namespace arm_compute
{
class ITensor;
-/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
- *
- * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel
- * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel
- * -# @ref NEGEMMLowpMatrixMultiplyKernel
- * -# @ref NEGEMMLowpOffsetContributionKernel
- * -# @ref NEActivationLayer
- *
- * otherwise if the DOT product instruction is available:
+class ITensorInfo;
+
+/** Function to run Gemm on quantized types.
*
- * -# @ref NEGEMMLowpOffsetContributionKernel
+ * This function calls the following:
*
-*/
+ * -# @ref cpu::CpuGemmLowpMatrixMultiplyCore
+ */
class NEGEMMLowpMatrixMultiplyCore : public IFunction
{
public:
@@ -114,7 +110,7 @@ public:
private:
struct Impl;
- std::unique_ptr<struct Impl> _impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */
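For context, the public NEGEMMLowpMatrixMultiplyCore interface documented above does not change in this patch; only its internals now delegate to cpu::CpuGemmLowpMatrixMultiplyCore. A minimal usage sketch follows; the shapes, quantization parameters and the omitted bias tensor are illustrative assumptions, not values taken from this patch.

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // A: M=4 x K=16, B: K=16 x N=8, both QASYMM8; the raw GEMMLowp output is S32.
    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::S32));

    NEGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, nullptr, &dst); // bias (c) is optional and omitted here

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized data ...
    gemmlowp.run();
    return 0;
}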
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 77adffd543..acbd92cff7 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -43,8 +43,13 @@ namespace arm_compute
class ITensor;
class ITensorInfo;
class NEQLSTMLayerNormalizationKernel;
-class NEGEMMLowpMatrixAReductionKernel;
-
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace cpu
/** Basic function to run @ref NEQLSTMLayer
*
* This function calls the following kernels:
@@ -55,7 +60,7 @@ class NEGEMMLowpMatrixAReductionKernel;
* -# @ref NECopy Copy kernel for copying output_state_out to output
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
* -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
- * -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
+ * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel For precomputing effective biases to use
* -# @ref NEPixelWiseMultiplication Elementwise multiplication
* -# @ref NETranspose Transpose function for reshaping the weights
* */
@@ -250,70 +255,70 @@ private:
};
// Functions used
- NETranspose _transpose_input_to_forget_weights;
- NETranspose _transpose_input_to_cell_weights;
- NETranspose _transpose_input_to_output_weights;
- NETranspose _transpose_input_to_input_weights;
- NETranspose _transpose_recurrent_to_forget_weights;
- NETranspose _transpose_recurrent_to_cell_weights;
- NETranspose _transpose_recurrent_to_output_weights;
- NETranspose _transpose_recurrent_to_input_weights;
- NETranspose _transpose_projection_weights;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction;
- NEArithmeticAddition _projection_bias_add;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget;
- NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget;
- NEGEMMLowpOutputStage _input_to_forget_outstage;
- NEGEMMLowpOutputStage _recurrent_to_forget_outstage;
- NEGEMMLowpOutputStage _cell_to_forget_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_forget;
- NEArithmeticAddition _accumulate_cell_forget;
- NEActivationLayer _forget_gate_sigmoid;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell;
- NEGEMMLowpOutputStage _input_to_cell_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell;
- NEGEMMLowpOutputStage _recurrent_to_cell_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_modulation;
- NEActivationLayer _cell_gate_tanh;
- NEArithmeticSubtraction _input_gate_sub;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_input;
- NEGEMMLowpOutputStage _input_to_input_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input;
- NEGEMMLowpOutputStage _recurrent_to_input_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_input;
- NEPixelWiseMultiplication _pixelwise_mul_cell_to_input;
- NEGEMMLowpOutputStage _cell_to_input_outstage;
- NEArithmeticAddition _accumulate_cell_input;
- NEActivationLayer _input_gate_sigmoid;
- NEPixelWiseMultiplication _pixelwise_mul_forget_cell;
- NEPixelWiseMultiplication _pixelwise_mul_input_cell;
- NEArithmeticAddition _add_forget_cell;
- NEActivationLayer _cell_clip;
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_output;
- NEGEMMLowpOutputStage _input_to_output_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output;
- NEGEMMLowpOutputStage _recurrent_to_output_outstage;
- NEArithmeticAddition _accumulate_input_recurrent_output;
- NEPixelWiseMultiplication _pixelwise_mul_cell_to_output;
- NEGEMMLowpOutputStage _cell_to_output_outstage;
- NEArithmeticAddition _accumulate_cell_to_output;
- NEActivationLayer _output_gate_sigmoid;
- NEActivationLayer _hidden_tanh;
- NEPixelWiseMultiplication _pixelwise_mul_hidden;
- NEGEMMLowpOutputStage _hidden_outstage;
- NEGEMMLowpMatrixMultiplyCore _mm_projection;
- NEGEMMLowpOutputStage _projection_outstage;
- NEArithmeticAddition _accumulate_projection;
- NEActivationLayer _projection_clip;
+ NETranspose _transpose_input_to_forget_weights;
+ NETranspose _transpose_input_to_cell_weights;
+ NETranspose _transpose_input_to_output_weights;
+ NETranspose _transpose_input_to_input_weights;
+ NETranspose _transpose_recurrent_to_forget_weights;
+ NETranspose _transpose_recurrent_to_cell_weights;
+ NETranspose _transpose_recurrent_to_output_weights;
+ NETranspose _transpose_recurrent_to_input_weights;
+ NETranspose _transpose_projection_weights;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction;
+ NEArithmeticAddition _projection_bias_add;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget;
+ NEGEMMLowpOutputStage _input_to_forget_outstage;
+ NEGEMMLowpOutputStage _recurrent_to_forget_outstage;
+ NEGEMMLowpOutputStage _cell_to_forget_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_forget;
+ NEArithmeticAddition _accumulate_cell_forget;
+ NEActivationLayer _forget_gate_sigmoid;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell;
+ NEGEMMLowpOutputStage _input_to_cell_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell;
+ NEGEMMLowpOutputStage _recurrent_to_cell_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_modulation;
+ NEActivationLayer _cell_gate_tanh;
+ NEArithmeticSubtraction _input_gate_sub;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_input;
+ NEGEMMLowpOutputStage _input_to_input_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input;
+ NEGEMMLowpOutputStage _recurrent_to_input_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_input;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_input;
+ NEGEMMLowpOutputStage _cell_to_input_outstage;
+ NEArithmeticAddition _accumulate_cell_input;
+ NEActivationLayer _input_gate_sigmoid;
+ NEPixelWiseMultiplication _pixelwise_mul_forget_cell;
+ NEPixelWiseMultiplication _pixelwise_mul_input_cell;
+ NEArithmeticAddition _add_forget_cell;
+ NEActivationLayer _cell_clip;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_output;
+ NEGEMMLowpOutputStage _input_to_output_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output;
+ NEGEMMLowpOutputStage _recurrent_to_output_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_output;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_output;
+ NEGEMMLowpOutputStage _cell_to_output_outstage;
+ NEArithmeticAddition _accumulate_cell_to_output;
+ NEActivationLayer _output_gate_sigmoid;
+ NEActivationLayer _hidden_tanh;
+ NEPixelWiseMultiplication _pixelwise_mul_hidden;
+ NEGEMMLowpOutputStage _hidden_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_projection;
+ NEGEMMLowpOutputStage _projection_outstage;
+ NEArithmeticAddition _accumulate_projection;
+ NEActivationLayer _projection_clip;
TensorCopyKernel _projection_bias_copy;
TensorCopyKernel _projection_output_to_accumulate_copy;
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 78c13041ee..e948c2f062 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -243,17 +243,17 @@ v20.11 Public major release
- NEDirectConvolutionLayerKernel
- NEScaleKernel
- NELocallyConnectedMatrixMultiplyKernel
- - @ref NEGEMMLowpOffsetContributionKernel
+ - NEGEMMLowpOffsetContributionKernel
- NEGEMMTranspose1xWKernel
- NEPoolingLayerKernel
- NEConvolutionKernel
- NEDepthwiseConvolutionLayerNativeKernel
- - @ref NEGEMMLowpMatrixMultiplyKernel
+ - NEGEMMLowpMatrixMultiplyKernel
- NEGEMMMatrixMultiplyKernel
- NEDirectConvolutionLayerOutputStageKernel
- @ref NEReductionOperationKernel
- - @ref NEGEMMLowpMatrixAReductionKernel
- - @ref NEGEMMLowpMatrixBReductionKernel
+ - NEGEMMLowpMatrixAReductionKernel
+ - NEGEMMLowpMatrixBReductionKernel
- Removed padding from OpenCL kernels:
- CLBatchConcatenateLayerKernel
- CLElementwiseOperationKernel
@@ -848,7 +848,7 @@ v19.05 Public major release
- @ref NEFFTDigitReverseKernel
- @ref NEFFTRadixStageKernel
- @ref NEFFTScaleKernel
- - @ref NEGEMMLowpOffsetContributionOutputStageKernel
+ - NEGEMMLowpOffsetContributionOutputStageKernel
- NEHeightConcatenateLayerKernel
- @ref NESpaceToBatchLayerKernel / @ref NESpaceToBatchLayer
- @ref NEFFT1D
@@ -1242,7 +1242,7 @@ v17.12 Public major release
- arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore
- arm_compute::NEHGEMMAArch64FP16Kernel
- NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
- - @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore
+ - NEGEMMLowpOffsetContributionKernel / NEGEMMLowpMatrixAReductionKernel / NEGEMMLowpMatrixBReductionKernel / NEGEMMLowpMatrixMultiplyCore
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- NEWinogradLayer / NEWinogradLayerKernel
@@ -1364,7 +1364,7 @@ v17.03.1 First Major public release of the sources
- NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer
- NEIm2ColKernel, @ref NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
- NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
- - @ref NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp
+ - NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp
v17.03 Sources preview
- New OpenCL kernels / functions:
diff --git a/filelist.json b/filelist.json
index 061c3e885f..56274954c8 100644
--- a/filelist.json
+++ b/filelist.json
@@ -844,7 +844,7 @@
"ConvertQuantizedSignedness": {
"files": {
"kernel": [
- "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp"
+ "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp"
]
}
},
@@ -1132,16 +1132,19 @@
"GemmAssemblyDispatch"
],
"files": {
- "operator" : ["src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp"],
+ "operator" : [
+ "src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp",
+ "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp"
+ ],
"kernel": [
- "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp",
"src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp",
"src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp",
"src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
"src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp",
- "src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp",
- "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp",
- "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp"
+ "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp",
+ "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp",
+ "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp",
+ "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp"
]
}
},
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 69c8d7bebc..cd09544d31 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -33,7 +33,6 @@
#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
#include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
#include "src/core/NEON/kernels/NECropKernel.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
@@ -41,10 +40,6 @@
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
#include "src/core/NEON/kernels/NEGatherKernel.h"
#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
deleted file mode 100644
index 67d5ca246e..0000000000
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-#define ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */
-class NEConvertQuantizedSignednessKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEConvertQuantizedSignednessKernel";
- }
- /** Default constructor */
- NEConvertQuantizedSignednessKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default;
- /** Default destructor */
- ~NEConvertQuantizedSignednessKernel() = default;
- /** Initialize the kernel's input, output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data types supported: opposite of @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data types supported: opposite of @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index b9a1b5e840..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to multiply matrices
- *
- * @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
- * This kernel performs the following computation:
- *
- * -# Convert a values from int8 to int32
- * -# Convert b values from int8 to int32
- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixMultiplyKernel";
- }
- /** Constructor */
- NEGEMMLowpMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpMatrixMultiplyKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * The input matrices @p input0 and @p input1 must be the output of the kernels: cpu::kernels::CpuGemmInterleave4x4Kernel and @ref cpu::kernels::CpuGemmTranspose1xWKernel. These two
- * kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- bool _slide_matrix_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 6908f37aad..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel.
- *
- * The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8.
- * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8.
- *
- * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is:
- *
- * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is:
- *
- * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * and mm_result'[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- */
-
-class NEGEMMLowpOffsetContributionOutputStageKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpOffsetContributionOutputStageKernel";
- }
- /** Constructor */
- NEGEMMLowpOffsetContributionOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionOutputStageKernel(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionOutputStageKernel &operator=(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionOutputStageKernel(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionOutputStageKernel &operator=(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpOffsetContributionOutputStageKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[out] output Output tensor containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] k Number of matrix A columns or Matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
- */
- void configure(const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionOutputStageKernel
- *
- * @param[in] mm_result Input tensor info containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Tensor info for the input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Tensor info for the input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] output Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
- int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Function to use for the particular tensors passed to configure() */
- const ITensor *_vector_sum_col;
- const ITensor *_vector_sum_row;
- const ITensor *_bias;
- const ITensor *_mm_result;
- ITensor *_output;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _k_offset;
- bool _slide_vector_sum_col;
- GEMMLowpOutputStageInfo _output_stage;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
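The removed header above is where the offset-contribution and output-stage arithmetic was spelled out. As a reader aid, here is a scalar reference of that computation, a minimal sketch that assumes simple round-to-nearest and ignores the saturation and tie-handling details of the NEON fixed-point path; all names are illustrative.

#include <cstdint>

// Scalar reference for one output element, following the formulas documented
// in the removed header. mm_result is the raw int32 accumulator of the GEMM.
int32_t offset_contribution_output_stage(int32_t mm_result,
                                         int32_t vector_sum_col_k, // sum of column k of matrix B
                                         int32_t vector_sum_row_i, // sum of row i of matrix A
                                         int32_t a_offset, int32_t b_offset, int32_t k,
                                         int32_t result_fixedpoint_multiplier,
                                         int32_t result_shift,
                                         int32_t result_offset_after_shift)
{
    // Offset contribution, performed in-place on the accumulator:
    // mm_result' = mm_result + a_offset * sum_col + b_offset * sum_row + a_offset * b_offset * k
    const int32_t acc = mm_result + a_offset * vector_sum_col_k + b_offset * vector_sum_row_i + a_offset * b_offset * k;

    // FixedPointMul(x, y): nearest integer to (x * y) / 2^31 (rounding simplified here).
    const int64_t prod = static_cast<int64_t>(acc) * result_fixedpoint_multiplier;
    const int32_t mul  = static_cast<int32_t>((prod + (1LL << 30)) >> 31);

    // Arithmetic shift right by result_shift, then add the output offset.
    return (mul >> result_shift) + result_offset_after_shift;
}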
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
deleted file mode 100644
index 9be618d656..0000000000
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all reduction kernels */
-class INEGEMMLowpReductionKernel : public INEKernel
-{
-public:
- /** Constructor */
- INEGEMMLowpReductionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default;
- /** Default destructor */
- virtual ~INEGEMMLowpReductionKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
- const ITensor *_input;
- ITensor *_output;
- int32_t _k;
- int32_t _scalar;
- bool _mul_by_scalar;
-};
-
-/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixAReductionKernel";
- }
- /** Default constructor */
- NEGEMMLowpMatrixAReductionKernel() = default;
- /** Prevent instances of this class from being copied */
- NEGEMMLowpMatrixAReductionKernel(const NEGEMMLowpMatrixAReductionKernel &) = delete;
- /** Prevent instances of this class from being copied */
- NEGEMMLowpMatrixAReductionKernel &operator=(const NEGEMMLowpMatrixAReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixAReductionKernel(NEGEMMLowpMatrixAReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixAReductionKernel &operator=(NEGEMMLowpMatrixAReductionKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpMatrixAReductionKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_a_cols) Number of matrix A columns
- * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
- * - scalar Scalar value to multiply each reduced row by.
- * - mul_byscalar True if each reduced column must be multiplied by a scalar value.
- */
- void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_a_cols) Number of matrix A columns
- * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
- * - scalar Scalar value to multiply each reduced row by.
- * - mul_byscalar True if each reduced column must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Execution of the reduction kernel specialized on the input type
- *
- * @param[in] window Execution window
- */
- template <typename T>
- void run_internal(const Window &window);
-};
-
-/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixBReductionKernel";
- }
- /** Default constructor */
- NEGEMMLowpMatrixBReductionKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixBReductionKernel(const NEGEMMLowpMatrixBReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixBReductionKernel &operator=(const NEGEMMLowpMatrixBReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixBReductionKernel(NEGEMMLowpMatrixBReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixBReductionKernel &operator=(NEGEMMLowpMatrixBReductionKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpMatrixBReductionKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_b_rows) Number of matrix B rows.
- * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
- * - scalar Scalar value to multiply each reduced row by.
- * - mul_byscalar True if each reduced row must be multiplied by a scalar value.
- */
- void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
- *
- * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_b_rows) Number of matrix B rows.
- * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
- * - scalar Scalar value to multiply each reduced row by.
- * - mul_byscalar True if each reduced row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Execution of the reduction kernel specialized on the input type
- *
- * @param[in] window Execution window
- * @param[in] info Thread-related information
- */
- template <typename T>
- void run_internal(const Window &window, const ThreadInfo &info);
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H */
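The removed reduction kernels above compute the vector_sum_row / vector_sum_col terms consumed by the offset-contribution stage shown earlier. A plain scalar sketch of those reductions, assuming row-major layouts purely for illustration:

#include <cstdint>
#include <vector>

// vector_sum_row[i] = sum of row i of A (M x K); vector_sum_col[n] = sum of column n of B (K x N).
// These feed the offset-contribution formula in the previous sketch.
void reference_gemmlowp_reductions(const std::vector<int8_t> &a, const std::vector<int8_t> &b,
                                   int M, int K, int N,
                                   std::vector<int32_t> &vector_sum_row,
                                   std::vector<int32_t> &vector_sum_col)
{
    vector_sum_row.assign(M, 0);
    vector_sum_col.assign(N, 0);
    for(int i = 0; i < M; ++i)
        for(int j = 0; j < K; ++j)
            vector_sum_row[i] += a[i * K + j];
    for(int j = 0; j < K; ++j)
        for(int n = 0; n < N; ++n)
            vector_sum_col[n] += b[j * N + n];
}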
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
index 1f2170f42a..26cbb48deb 100644
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
+++ b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -35,76 +35,73 @@
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
// Validate output if initialized
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
{
// Output auto inizialitation if not yet initialized
{
- const bool is_input_signed = input->data_type() == DataType::QASYMM8_SIGNED;
+ const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED;
const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
const int offset_correction = is_input_signed ? -128 : 128;
const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
- auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
+ auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
}
- return std::make_pair(Status{}, calculate_max_window(*output));
+ return std::make_pair(Status{}, calculate_max_window(*dst));
}
} // namespace
-NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output)
+void CpuConvertQuantizedSignednessKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- _input = input;
- _output = output;
-
- std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info());
+ std::pair<Status, Window> win_config = validate_and_configure_window(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ ICpuKernel::configure(win_config.second);
}
-Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+Status CpuConvertQuantizedSignednessKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
return Status{};
}
-void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info)
+void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(_input, win_collapsed);
- Iterator output(_output, win_collapsed);
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -135,4 +132,11 @@ void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadI
},
input, output);
}
+
+const char *CpuConvertQuantizedSignednessKernel::name() const
+{
+ return "CpuConvertQuantizedSignednessKernel";
+}
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
new file mode 100644
index 0000000000..2a8f6c364d
--- /dev/null
+++ b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H
+#define ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */
+class CpuConvertQuantizedSignednessKernel : public ICpuKernel
+{
+public:
+ CpuConvertQuantizedSignednessKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertQuantizedSignednessKernel);
+ /** Initialize the kernel input and output info.
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data types supported: opposite of @p src.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuConvertQuantizedSignednessKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H */
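A minimal sketch of how such a stateless kernel is driven, mirroring what the runtime operators in this patch do: configure from ITensorInfo, then supply the real tensors through an ITensorPack at run time. The helper function and tensor setup are assumptions for illustration, and since the header is internal this only compiles from inside the library or its tests.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"

using namespace arm_compute;

void convert_signedness_example(Tensor &src, Tensor &dst)
{
    // Configure with tensor *info* only; the kernel keeps no tensor pointers.
    cpu::kernels::CpuConvertQuantizedSignednessKernel k;
    k.configure(src.info(), dst.info());

    // The actual tensors are supplied at run time through an ITensorPack.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);

    NEScheduler::get().schedule_op(&k, Window::DimY, k.window(), pack);
}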
diff --git a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h
index 8f1a54314a..0c55886d8d 100644
--- a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h
+++ b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h
@@ -56,7 +56,6 @@ class CpuGemmInterleave4x4Kernel : public ICpuKernel
{
public:
CpuGemmInterleave4x4Kernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmInterleave4x4Kernel);
/** Initialise the kernel's src and dst.
*
* @param[in] src Input tensor info. Data types supported: All
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
index 6bcf59ee96..35e542faa4 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -36,10 +36,12 @@
#include <arm_neon.h>
-using namespace arm_compute;
-
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
@@ -860,19 +862,16 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int
},
ina, inb, out);
}
-} // namespace
-namespace
-{
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
- TensorShape in0_shape = input0->tensor_shape();
- TensorShape in1_shape = input1->tensor_shape();
- TensorShape out_shape = output->tensor_shape();
+ TensorShape in0_shape = src0->tensor_shape();
+ TensorShape in1_shape = src1->tensor_shape();
+ TensorShape out_shape = dst->tensor_shape();
// Check vector-by-matrix case
if(out_shape[1] == 1)
@@ -894,63 +893,58 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
}
} // namespace
-NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
+void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
-}
+ ARM_COMPUTE_UNUSED(src0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst));
-void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
- TensorShape in1_shape = input1->info()->tensor_shape();
+ TensorShape in1_shape = src1->tensor_shape();
in1_shape.collapse(2);
- _input0 = input0;
- _input1 = input1;
- _output = output;
_slide_matrix_b = in1_shape[2] != 1;
constexpr unsigned int num_elems_processed_per_iteration_x = 16;
constexpr unsigned int num_elems_processed_per_iteration_y = 4;
Window win;
-
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
- if((output->info()->dimension(1) == 1))
+ if((dst->dimension(1) == 1))
{
// Configure kernel window
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+ win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
}
else
{
- win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
}
- INEKernel::configure(win);
+ ICpuKernel::configure(win);
}
-Status NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst));
return Status{};
}
-void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
- if((_output->info()->dimension(1) == 1))
+ if((dst->info()->dimension(1) == 1))
{
- const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0));
- const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0));
- const auto width_out = static_cast<int>(_output->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type()));
+ const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0));
+ const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0));
+ const auto width_out = static_cast<int>(dst->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
// The implementation computes 16 elements per iteration
const int window_start_x = 16 * info.thread_id;
@@ -969,18 +963,18 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(_input1->info()->num_dimensions() >= 3)
+ if(src1->info()->num_dimensions() >= 3)
{
win_b = window;
}
win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
- Iterator ina(_input0, win_a);
- Iterator inb(_input1, win_b);
- Iterator out(_output, win_out);
+ Iterator ina(src0, win_a);
+ Iterator inb(src1, win_b);
+ Iterator out(dst, win_out);
- switch(_input0->info()->data_type())
+ switch(src0->info()->data_type())
{
case DataType::S8:
case DataType::QASYMM8_SIGNED:
@@ -1003,8 +997,8 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
}
else
{
- const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
- const int width_b = _input1->info()->dimension(0);
+ const size_t in_b_stride = src1->info()->strides_in_bytes()[1];
+ const int width_b = src1->info()->dimension(0);
         // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has a quarter of the rows of the output matrix
Window win_a(window);
@@ -1023,22 +1017,22 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
         // The step x and step y for the output matrix have already been set in configure()
- Iterator ina(_input0, win_a);
- Iterator inb(_input1, win_b);
- Iterator out(_output, window);
+ Iterator ina(src0, win_a);
+ Iterator inb(src1, win_b);
+ Iterator out(dst, window);
- switch(_input0->info()->data_type())
+ switch(src0->info()->data_type())
{
case DataType::S8:
case DataType::QASYMM8_SIGNED:
{
- matrix_multiply_s8(ina, inb, out, width_b, *_output->info(), window);
+ matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window);
break;
}
case DataType::U8:
case DataType::QASYMM8:
{
- matrix_multiply_u8(ina, inb, out, width_b, *_output->info(), window);
+ matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window);
break;
}
default:
@@ -1049,4 +1043,11 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
}
}
}
-} // namespace arm_compute
+
+const char *CpuGemmLowpMatrixMultiplyKernel::name() const
+{
+ return "CpuGemmLowpMatrixMultiplyKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
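
For reference, the vector-by-matrix path selected above when the destination has a single row reduces, once the NEON vectorisation, the reshaped operand layouts and the windowing are stripped away, to a widened dot product per output column. The sketch below uses illustrative buffer layouts and names, not the library's API:

    #include <cstdint>
    #include <vector>

    // Reference vector-by-matrix product: lhs is 1 x K, rhs is K x N (row-major),
    // dst is 1 x N. Values are widened to int32 before accumulating, matching the
    // S32 destination of the lowp matrix-multiply kernel.
    std::vector<int32_t> gemv_lowp_reference(const std::vector<uint8_t> &lhs,
                                             const std::vector<uint8_t> &rhs,
                                             int K, int N)
    {
        std::vector<int32_t> dst(N, 0);
        for(int j = 0; j < N; ++j)
        {
            int32_t acc = 0;
            for(int k = 0; k < K; ++k)
            {
                acc += static_cast<int32_t>(lhs[k]) * static_cast<int32_t>(rhs[k * N + j]);
            }
            dst[j] = acc;
        }
        return dst;
    }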
diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..77d8741b19
--- /dev/null
+++ b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to multiply matrices
+ *
+ * @note @ref CpuGemmLowpMatrixMultiplyKernel is the low precision matrix product kernel.
+ * This kernel performs the following computation:
+ *
+ * -# Convert the values of matrix A from int8 to int32
+ * -# Convert the values of matrix B from int8 to int32
+ * -# Compute the int32 matrix product of the converted A and B and store the result as int32
+ *
+ */
+class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpMatrixMultiplyKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * The input matrices @p src0 and @p src1 must be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel. These two
+ * kernels change the layout of the original matrices to be more cache-friendly.
+ *
+ * @param[in] src0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+ * @param[in] src1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[out] dst Output tensor info to store the result of matrix multiplication. Data type supported: S32
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixMultiplyKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ bool _slide_matrix_b{ true };
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H*/
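
Setting aside the interleave4x4/transpose1xW reshaping that the real kernel expects on its inputs, the computation listed in the documentation above can be sketched as a plain reference GEMM (illustrative names and row-major layouts assumed):

    #include <cstdint>
    #include <vector>

    // Reference low-precision GEMM: A is M x K, B is K x N (both row-major, int8),
    // the result is an M x N int32 matrix. Each operand value is converted to int32
    // before the multiply-accumulate, as listed in the kernel documentation.
    std::vector<int32_t> gemmlowp_reference(const std::vector<int8_t> &a,
                                            const std::vector<int8_t> &b,
                                            int M, int K, int N)
    {
        std::vector<int32_t> dst(M * N, 0);
        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                int32_t acc = 0;
                for(int k = 0; k < K; ++k)
                {
                    acc += static_cast<int32_t>(a[i * K + k]) * static_cast<int32_t>(b[k * N + j]);
                }
                dst[i * N + j] = acc;
            }
        }
        return dst;
    }

Quantization offsets are intentionally absent here; they are applied afterwards by the offset contribution kernels ported below.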
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
index dfbfbd6fab..270abc8bbd 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
@@ -32,68 +32,80 @@
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- if(output->total_size() > 0)
+ if(dst->total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
}
return Status{};
}
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- if(output->total_size() > 0)
+ if(dst->total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
}
return Status{};
}
} // namespace
-INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
- : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false)
-{
-}
-
-void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
// Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
- ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
- _input = mtx_a;
- _output = vector_sum_row;
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info));
_k = info.k;
_scalar = info.scalar;
_mul_by_scalar = info.mul_by_scalar;
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32);
+ switch(src->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8:
+ case DataType::QSYMM8_PER_CHANNEL:
+ _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
- Window win = calculate_max_window(*_output->info(), Steps(1));
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32);
- INEKernel::configure(win);
+ Window win = calculate_max_window(*dst, Steps(1));
+ ICpuKernel::configure(win);
}
-Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
return Status{};
}
template <typename T>
-void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
+void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window)
{
// Intermediate and final accumulator types
using TIAcc = wrapper::traits::promote_t<T>;
@@ -106,15 +118,15 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w
win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(_input, win_input);
- Iterator out(_output, collapsed_window);
+ Iterator in(src, win_input);
+ Iterator out(dst, collapsed_window);
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
TAcc sum_row = 0;
- const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
@@ -160,36 +172,28 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w
in, out);
}
-void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- switch(_input->info()->data_type())
- {
- case DataType::QASYMM8:
- run_internal<uint8_t>(window);
- break;
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8:
- case DataType::QSYMM8_PER_CHANNEL:
- run_internal<int8_t>(window);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type");
- }
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (this->*_func)(src, dst, window);
}
-void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+const char *CpuGemmLowpMatrixAReductionKernel::name() const
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
- ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
+ return "CpuGemmLowpMatrixAReductionKernel";
+}
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
+void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));
- _input = mtx_b;
- _output = vector_sum_col;
_k = info.k;
_scalar = info.scalar;
_mul_by_scalar = info.mul_by_scalar;
@@ -197,24 +201,36 @@ void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 16;
+ switch(src->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8:
+ case DataType::QSYMM8_PER_CHANNEL:
+ _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
// Output auto initialization if not yet initialized
- auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32);
+ auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32);
// Configure kernel window
- Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration));
- INEKernel::configure(win);
+ Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration));
+ ICpuKernel::configure(win);
}
-Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
return Status{};
}
template <typename T>
-void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info)
{
// Intermediate and final accumulator types
using TIAcc = wrapper::traits::promote_t<T>;
@@ -223,8 +239,8 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const
Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
- const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
+ const auto width_matrix_b = static_cast<int>(src->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(src->info()->strides_in_bytes()[1]);
// The implementation computes 16 elements per iteration
const int window_start_x = 16 * info.thread_id;
@@ -239,8 +255,8 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const
win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator inb(_input, win_in);
- Iterator out(_output, win_out);
+ Iterator inb(src, win_in);
+ Iterator out(dst, win_out);
execute_window_loop(win_out, [&](const Coordinates & id)
{
@@ -258,7 +274,7 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const
wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
};
- const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
@@ -359,24 +375,22 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const
inb, out);
}
-void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- switch(_input->info()->data_type())
- {
- case DataType::QASYMM8:
- run_internal<uint8_t>(window, info);
- break;
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8:
- case DataType::QSYMM8_PER_CHANNEL:
- run_internal<int8_t>(window, info);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type");
- }
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ (this->*_func)(src, dst, window, info);
+}
+
+const char *CpuGemmLowpMatrixBReductionKernel::name() const
+{
+ return "CpuGemmLowpMatrixBReductionKernel";
}
-} // namespace arm_compute
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
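
Stripped of the NEON accumulation, the matrix A reduction implemented above boils down to one widened sum per row, optionally scaled, as in this standalone sketch (illustrative names and row-major layout assumed):

    #include <cstdint>
    #include <vector>

    // Row reduction of an M x K matrix (row-major): one int32 sum per row, optionally
    // multiplied by a scalar, mirroring the kernel's scalar/mul_by_scalar metadata.
    std::vector<int32_t> reduce_matrix_a_rows(const std::vector<int8_t> &a, int M, int K,
                                              bool mul_by_scalar, int32_t scalar)
    {
        std::vector<int32_t> vector_sum_row(M, 0);
        for(int i = 0; i < M; ++i)
        {
            int32_t sum = 0;
            for(int k = 0; k < K; ++k)
            {
                sum += static_cast<int32_t>(a[i * K + k]);
            }
            vector_sum_row[i] = mul_by_scalar ? sum * scalar : sum;
        }
        return vector_sum_row;
    }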
diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
new file mode 100644
index 0000000000..106980fc0b
--- /dev/null
+++ b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+struct GEMMLowpReductionKernelInfo;
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpMatrixAReductionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixAReductionKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[out] dst Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+ * @param[in] info Kernel metadata:
+ * - k (num_mtx_a_cols) Number of matrix A columns
+ * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
+ * - scalar Scalar value to multiply each reduced row by.
+ * - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixAReductionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Execution of the reduction kernel specialized on the input type
+ *
+ * @param[in] src Input tensor
+ * @param[out] dst Output tensor
+ * @param[in] window Execution window
+ */
+ template <typename T>
+ void run_internal(const ITensor *src, ITensor *dst, const Window &window);
+
+ /** Common signature for all reduction functions
+ *
+ * @param[in] src Input tensor
+ * @param[out] dst Output tensor
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+
+ CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr };
+ int32_t _k{ 0 };
+ int32_t _scalar{ 0 };
+ bool _mul_by_scalar{ false };
+};
+
+/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpMatrixBReductionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixBReductionKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[out] dst Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+ * @param[in] info Kernel metadata:
+ * - k (num_mtx_b_rows) Number of matrix B rows.
+ * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
+ * - scalar Scalar value to multiply each reduced column by.
+ * - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixBReductionKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Execution of the reduction kernel specialized on the input type
+ *
+ * @param[in] src Input tensor
+ * @param[out] dst Output tensor
+ * @param[in] window Execution window
+ * @param[in] info Thread-related information
+ */
+ template <typename T>
+ void run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info);
+
+ /** Common signature for all reduction functions
+ *
+ * @param[in] src Input tensor
+ * @param[out] dst Output tensor
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info);
+
+ CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr };
+ int32_t _k{ 0 };
+ int32_t _scalar{ 0 };
+ bool _mul_by_scalar{ false };
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H */
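
The matrix B reduction documented above is the column-wise counterpart; together the two vectors feed the offset contribution kernels. A standalone sketch under the same assumptions (illustrative names, row-major layout):

    #include <cstdint>
    #include <vector>

    // Column reduction of a K x N matrix (row-major): one int32 sum per column,
    // optionally multiplied by a scalar.
    std::vector<int32_t> reduce_matrix_b_columns(const std::vector<int8_t> &b, int K, int N,
                                                 bool mul_by_scalar, int32_t scalar)
    {
        std::vector<int32_t> vector_sum_col(N, 0);
        for(int k = 0; k < K; ++k)
        {
            for(int j = 0; j < N; ++j)
            {
                vector_sum_col[j] += static_cast<int32_t>(b[k * N + j]);
            }
        }
        if(mul_by_scalar)
        {
            for(int j = 0; j < N; ++j)
            {
                vector_sum_col[j] *= scalar;
            }
        }
        return vector_sum_col;
    }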
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
index 867beca0ac..9b1bf08955 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -38,6 +38,10 @@
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
@@ -354,26 +358,16 @@ void run_offset_contribution(const Window &window,
}
} // namespace
-NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
-{
-}
-
-void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
{
// Perform validate step
+ ARM_COMPUTE_UNUSED(vector_sum_row);
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
- a_offset, b_offset)); // NOLINT
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
- _a_offset = a_offset;
- _b_offset = b_offset;
- _k_offset = a_offset * b_offset * k;
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _k_offset = a_offset * b_offset * k;
// If a_offset == 0, vector_sum_col can be a nullptr
if(a_offset != 0)
@@ -381,33 +375,43 @@ void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITe
         // Check if vector_sum_col_shape should be slid or not
         // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1;
+ _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1;
}
// Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps());
- INEKernel::configure(win);
+ Window win = calculate_max_window(*mm_result, Steps());
+ ICpuKernel::configure(win);
}
-Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
+Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
-
return Status{};
}
-void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info)
+void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto mm_result = tensors.get_tensor(TensorType::ACL_DST);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = _vector_sum_row != nullptr
- && _mm_result->info()->num_dimensions() > 1
- && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr
+ && mm_result->info()->num_dimensions() > 1
+ && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
- run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
+ run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
+}
+
+const char *CpuGemmLowpOffsetContributionKernel::name() const
+{
+ return "CpuGemmLowpOffsetContributionKernel";
}
-} // namespace arm_compute
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
index f71929fe9e..f23a46cde7 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
@@ -21,18 +21,21 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H
-#include "src/core/NEON/INEKernel.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
namespace arm_compute
{
-class ITensor;
-
-/** Kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to add the offset contribution after @ref CpuGemmLowpMatrixMultiplyKernel. The computation is performed in-place
*
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel),
* and adds to it the offset contribution of matrix A and matrix B in-place.
*
* The final result is:
@@ -43,28 +46,15 @@ class ITensor;
* (a_offset * b_offset * k)
*
*/
-class NEGEMMLowpOffsetContributionKernel : public INEKernel
+class CpuGemmLowpOffsetContributionKernel : public ICpuKernel
{
public:
- const char *name() const override
- {
- return "NEGEMMLowpOffsetContributionKernel";
- }
- /** Constructor */
- NEGEMMLowpOffsetContributionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default;
- /** Default destructor */
- ~NEGEMMLowpOffsetContributionKernel() = default;
+ /** Default constructor */
+ CpuGemmLowpOffsetContributionKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionKernel);
/** Initialise the kernel's input and output.
*
- * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in, out] mm_result Input tensor containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -73,33 +63,26 @@ public:
* @param[in] a_offset Offset to be added to each element of the matrix A.
* @param[in] b_offset Offset to be added to each element of the matrix B.
*/
- void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
+ void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * Similar to CpuGemmLowpOffsetContributionKernel::configure()
*
* @return a status
*/
static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
// Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
private:
- const ITensor *_vector_sum_col;
- const ITensor *_vector_sum_row;
- ITensor *_mm_result;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _k_offset;
- bool _slide_vector_sum_col;
+ int32_t _a_offset{ 0 };
+ int32_t _b_offset{ 0 };
+ int32_t _k_offset{ 0 };
+ bool _slide_vector_sum_col{ true };
};
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H */
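
The in-place update this kernel performs is a per-element affine correction built from the two reduction vectors, following the formula in the documentation above. A plain C++ rendering (illustrative names, row-major layout; the real kernel skips a reduction vector entirely when the corresponding offset is zero):

    #include <cstdint>
    #include <vector>

    // In-place offset contribution on an M x N int32 accumulator:
    //   mm_result[i][j] += a_offset * vector_sum_col[j]
    //                    + b_offset * vector_sum_row[i]
    //                    + a_offset * b_offset * k
    void add_offset_contribution(std::vector<int32_t> &mm_result, int M, int N, int32_t k,
                                 const std::vector<int32_t> &vector_sum_col,
                                 const std::vector<int32_t> &vector_sum_row,
                                 int32_t a_offset, int32_t b_offset)
    {
        const int32_t k_offset = a_offset * b_offset * k; // precomputed once, as in configure()
        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                int32_t term = k_offset;
                if(a_offset != 0)
                {
                    term += a_offset * vector_sum_col[j]; // vector_sum_col may be absent when a_offset == 0
                }
                if(b_offset != 0)
                {
                    term += b_offset * vector_sum_row[i]; // vector_sum_row may be absent when b_offset == 0
                }
                mm_result[i * N + j] += term;
            }
        }
    }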
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
index dfed7f0bb8..332ce6f013 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -37,12 +37,13 @@
#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <map>
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
@@ -836,53 +837,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *output)
-{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8));
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result, Steps());
-
- // Note: This kernel performs 16 elements per iteration.
- // However, since we use a left-over for loop, we cannot have any read or write out of memory
- // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, win);
-}
} // namespace
-NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
- _output_stage(GEMMLowpOutputStageInfo())
-
-{
-}
-
-void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col,
- const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t k, int32_t a_offset, int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage)
+void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
+ int32_t k, int32_t a_offset, int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
+ ARM_COMPUTE_UNUSED(vector_sum_row, bias);
// Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
- bias != nullptr ? bias->info() : nullptr, // NOLINT
- output->info(), a_offset, b_offset, output_stage)); // NOLINT
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _bias = bias;
- _mm_result = mm_result;
- _output = output;
- _a_offset = a_offset;
- _b_offset = b_offset;
- _k_offset = a_offset * b_offset * k;
- _output_stage = output_stage;
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _k_offset = a_offset * b_offset * k;
+ _output_stage = output_stage;
// If a_offset == 0, vector_sum_col can be a nullptr
if(a_offset != 0)
@@ -890,40 +860,51 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_
         // Check if vector_sum_col_shape should be slid or not
         // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1;
+ _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1;
}
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, mm_result->clone()->set_data_type(DataType::QASYMM8));
+
// Configure kernel window
- auto win_config = validate_and_configure_window(mm_result->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ Window win = calculate_max_window(*mm_result, Steps());
+
+ // Note: This kernel performs 16 elements per iteration.
+ // However, since we use a left-over for loop, we cannot have any read or write out of memory
+ // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped
+ ICpuKernel::configure(win);
}
-Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
- const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+ int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), output->clone().get()).first);
return Status{};
}
-void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, const ThreadInfo &info)
+void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto mm_result = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_3);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
PixelValue type_min{};
PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(_output->info()->data_type());
+ std::tie(type_min, type_max) = get_min_max(dst->info()->data_type());
int32_t type_min_int = type_min.get<int32_t>();
int32_t type_max_int = type_max.get<int32_t>();
- const bool reinterpret_as_3d = _vector_sum_row != nullptr
- && _mm_result->info()->num_dimensions() > 1
- && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr
+ && mm_result->info()->num_dimensions() > 1
+ && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
@@ -931,29 +912,35 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, co
const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
     // Check if the output is signed
- const bool is_signed = _output->info()->data_type() == DataType::QASYMM8_SIGNED;
+ const bool is_signed = dst->info()->data_type() == DataType::QASYMM8_SIGNED;
// Check if symmetric per-channel execution
const bool is_symm = _output_stage.is_quantized_per_channel;
if(is_symm)
{
- run_offset_contribution_output_stage_symm(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+ run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
else
{
if(is_signed)
{
- run_offset_contribution_output_stage<int8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+ run_offset_contribution_output_stage<int8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
else
{
- run_offset_contribution_output_stage<uint8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+ run_offset_contribution_output_stage<uint8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
}
}
+const char *CpuGemmLowpOffsetContributionOutputStageKernel::name() const
+{
+ return "CpuGemmLowpOffsetContributionOutputStageKernel";
+}
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
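
The is_bounded_relu flag computed in run_op above only records whether the requested output-stage bounds are tighter than the natural range of the destination type, as in this standalone sketch (illustrative function name):

    #include <cstdint>
    #include <limits>

    // Clamping is only required when the output stage bounds are stricter than the
    // full range of the (u)int8 destination.
    bool needs_bounded_relu(int32_t gemmlowp_min_bound, int32_t gemmlowp_max_bound, bool is_signed)
    {
        const int32_t type_min = is_signed ? std::numeric_limits<int8_t>::min() : std::numeric_limits<uint8_t>::min();
        const int32_t type_max = is_signed ? std::numeric_limits<int8_t>::max() : std::numeric_limits<uint8_t>::max();
        return !(gemmlowp_min_bound <= type_min && gemmlowp_max_bound >= type_max);
    }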
diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
new file mode 100644
index 0000000000..404f2c9496
--- /dev/null
+++ b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel used to add the offset contribution and perform the output stage after @ref CpuGemmLowpMatrixMultiplyKernel.
+ *
+ * The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8.
+ * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8.
+ *
+ * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is:
+ *
+ * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift
+ *
+ * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is:
+ *
+ * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ * (x * y) / 2^31
+ *
+ * and mm_result'[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
+ */
+
+class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel
+{
+public:
+ /** Default constructor */
+ CpuGemmLowpOffsetContributionOutputStageKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionOutputStageKernel);
+ /** Initialise the kernel inputs and output.
+ *
+ * @param[in] mm_result Input tensor info containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector tensor info of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in]  vector_sum_row Input row-vector tensor info of sums of all the entries in each row of matrix A.
+ *                            Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+ * @param[out] dst Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  k              Number of matrix A columns or matrix B rows
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
+ */
+ void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpOffsetContributionOutputStageKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+    /** Parameters set by configure() and consumed by run_op() */
+ int32_t _a_offset{ 0 };
+ int32_t _b_offset{ 0 };
+ int32_t _k_offset{ 0 };
+ bool _slide_vector_sum_col{ true };
+ GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() };
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H */
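Note on the doc block in the header above: the formulas fully determine the scalar arithmetic of the fused fixed-point path. Below is a minimal scalar sketch of one output element, assuming round-half-up behaviour for FixedPointMul and a plain [0, 255] clamp for QASYMM8; the kernel itself vectorises this with rounding/saturating NEON instructions, and all names in the sketch are illustrative.

#include <algorithm>
#include <cstdint>

// Nearest integer to (x * y) / 2^31, as defined in the kernel documentation (round-half-up).
static int32_t fixed_point_mul(int32_t x, int32_t y)
{
    const int64_t prod = static_cast<int64_t>(x) * static_cast<int64_t>(y);
    return static_cast<int32_t>((prod + (int64_t{ 1 } << 30)) >> 31);
}

// One element of the fused offset contribution + QuantizeDownInt32ToUint8ScaleByFixedPoint stage.
static uint8_t requantize_element(int32_t mm_result, int32_t vector_sum_col_k, int32_t vector_sum_row_i,
                                  int32_t a_offset, int32_t b_offset, int32_t k,
                                  int32_t result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift)
{
    // mm_result'[i][k] = mm_result[i][k] + vector_sum_col[k] * a_offset + vector_sum_row[i] * b_offset + a_offset * b_offset * k
    const int32_t acc    = mm_result + vector_sum_col_k * a_offset + vector_sum_row_i * b_offset + a_offset * b_offset * k;
    const int32_t scaled = (fixed_point_mul(acc, result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift;
    return static_cast<uint8_t>(std::min(255, std::max(0, scaled)));
}

For the QuantizeDownInt32ToUint8Scale variant the last two lines would instead compute ((acc + result_offset) * result_mult_int) >> result_shift before the same clamp.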
diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
index f3cdbdc610..ca5e1b40fc 100644
--- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -38,7 +38,7 @@ namespace kernels
{
/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Add offset terms to final result
diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 7a1197d2cf..e360e65bae 100644
--- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -38,7 +38,7 @@ namespace kernels
{
/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
*
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 9ebb529990..9c213abdf7 100644
--- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -38,7 +38,7 @@ namespace kernels
{
/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index 312cad971b..13b30f3427 100644
--- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -38,7 +38,7 @@ namespace kernels
{
/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
*
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 0aba3c03ec..641a2c2b5f 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -23,660 +23,104 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
-#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+
+using namespace arm_compute::experimental;
namespace arm_compute
{
-namespace
-{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
-{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.output_stage = info.gemmlowp_output_stage();
-
- return asm_info;
-}
-} // namespace
-
struct NEGEMMLowpMatrixMultiplyCore::Impl
{
- MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
- std::unique_ptr<cpu::CpuGemmAssemblyDispatch> asm_glue{ nullptr };
- std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel> mm_kernel{ nullptr };
- std::unique_ptr<cpu::kernels::CpuGemmInterleave4x4Kernel> mtx_a_reshape_kernel{ nullptr };
- std::unique_ptr<cpu::kernels::CpuGemmTranspose1xWKernel> mtx_b_reshape_kernel{ nullptr };
- std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> mtx_a_reduction_kernel{ nullptr };
- std::unique_ptr<NEGEMMLowpMatrixBReductionKernel> mtx_b_reduction_kernel{ nullptr };
- std::unique_ptr<NEGEMMLowpOffsetContributionKernel> offset_contribution_kernel{ nullptr };
- std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> offset_contribution_output_stage_kernel{ nullptr };
- std::unique_ptr<NEActivationLayer> activation_func{ nullptr };
- std::unique_ptr<NEConvertQuantizedSignednessKernel> convert_to_signed_asymm{ nullptr };
- std::unique_ptr<NEConvertQuantizedSignednessKernel> convert_from_signed_asymm{ nullptr };
-
- const ITensor *a_to_use{ nullptr };
- Tensor vector_sum_col{};
- Tensor vector_sum_row{};
- Tensor tmp_a{};
- Tensor tmp_b{};
- Tensor mm_result_s32{};
- Tensor signed_a{};
- Tensor signed_output{};
- const ITensor *original_b{ nullptr };
- int32_t a_offset{ 0 };
- int32_t b_offset{ 0 };
-
- bool run_vector_matrix_multiplication{ false };
- bool assembly_path{ false };
- bool fused_assembly_path{ false };
- bool reshape_b_only_on_first_run{ false };
- bool is_prepared{ false };
- bool fuse_output_stage{ false };
- bool run_activation{ false };
- bool flip_signedness{ false };
-
- experimental::MemoryRequirements aux_mem_req{};
- ITensorPack asm_glue_run_pack{};
- ITensorPack asm_glue_prep_pack{};
- WorkspaceData<Tensor> asm_glue_workspace{};
+ const ITensor *b{ nullptr };
+ std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{ nullptr };
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{ nullptr };
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ bool is_prepared{ false };
};
-using namespace arm_compute::experimental;
-using namespace arm_compute::misc::shape_calculator;
-
-NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
-
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _impl(std::make_unique<struct NEGEMMLowpMatrixMultiplyCore::Impl>())
+ : _impl(std::make_unique<Impl>())
{
- _impl->memory_group = MemoryGroup(memory_manager);
_impl->weights_manager = weights_manager;
+ _impl->memory_group = MemoryGroup(memory_manager);
}
+NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(c);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- const ITensor *matrix_a = a;
- const ITensor *matrix_b = b;
- GEMMInfo info = gemm_info;
-
- // Set internal variables
- _impl->a_offset = a->info()->quantization_info().uniform().offset;
- _impl->b_offset = b->info()->quantization_info().uniform().offset;
- _impl->run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _impl->reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
- _impl->is_prepared = false;
- _impl->fused_assembly_path = false;
- _impl->flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _impl->reshape_b_only_on_first_run;
- _impl->original_b = b;
-
- _impl->asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
-
- _impl->a_to_use = a;
-
- // Convert to QASYMM8 -> QASYMM8_SIGNED and back
- if(_impl->flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = _impl->a_to_use->info()->quantization_info().uniform();
-
- _impl->signed_a.allocator()->init(_impl->a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
- _impl->memory_group.manage(&_impl->signed_a);
- _impl->convert_to_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>();
- _impl->convert_to_signed_asymm->configure(_impl->a_to_use, &_impl->signed_a);
- _impl->a_to_use = &_impl->signed_a;
- _impl->a_offset = _impl->signed_a.info()->quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
- _impl->memory_group.manage(&_impl->signed_output);
- _impl->signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = _impl->signed_output.info()->quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a = &_impl->signed_a;
- }
-
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- _impl->fuse_output_stage = true;
- _impl->memory_group.manage(&_impl->mm_result_s32);
- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
- _impl->mm_result_s32.allocator()->init(info_mm_result_s32);
- }
-
- // Initialize assembly kernel meta-data
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
-#ifdef __aarch64__
- switch(a->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::U8:
- case DataType::S8:
- {
- if(is_data_type_quantized_asymmetric(_impl->a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- auto c_info_to_use = c == nullptr ? nullptr : c->info();
- _impl->asm_glue->configure(_impl->a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info);
- _impl->fused_assembly_path = _impl->asm_glue->is_configured();
- _impl->asm_glue_run_pack.add_const_tensor(TensorType::ACL_SRC_2, c);
- _impl->asm_glue_run_pack.add_tensor(TensorType::ACL_DST, output);
- }
- else
- {
- auto output_to_use = (_impl->fuse_output_stage ? &_impl->mm_result_s32 : output);
- _impl->asm_glue->configure(_impl->a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info);
- _impl->asm_glue_run_pack.add_tensor(TensorType::ACL_DST, output_to_use);
- }
- _impl->assembly_path = _impl->asm_glue->is_configured();
-
- if(_impl->assembly_path)
- {
- _impl->asm_glue_run_pack.add_const_tensor(TensorType::ACL_SRC_0, _impl->a_to_use);
-
- _impl->aux_mem_req = _impl->asm_glue->workspace();
- _impl->asm_glue_prep_pack = { { TensorType::ACL_SRC_1, b }, { TensorType::ACL_SRC_2, c } };
-
- _impl->asm_glue_workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->asm_glue_run_pack, _impl->asm_glue_prep_pack);
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Datatype not supported");
- break;
- }
- }
-#endif /* __aarch64__ */
- if(!(_impl->assembly_path || _impl->run_vector_matrix_multiplication))
- {
- matrix_a = &_impl->tmp_a;
- matrix_b = &_impl->tmp_b;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*_impl->a_to_use->info()), 1, _impl->a_to_use->info()->data_type(), _impl->a_to_use->info()->quantization_info());
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
- _impl->tmp_a.allocator()->init(a_info);
- _impl->tmp_b.allocator()->init(b_info);
- _impl->memory_group.manage(&_impl->tmp_a);
- if(!_impl->reshape_b_only_on_first_run)
- {
- _impl->memory_group.manage(&_impl->tmp_b);
- }
-
- // Configure interleave kernel
- _impl->mtx_a_reshape_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
- _impl->mtx_a_reshape_kernel->configure(_impl->a_to_use->info(), _impl->tmp_a.info());
-
- // Configure transpose kernel
- _impl->mtx_b_reshape_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
- _impl->mtx_b_reshape_kernel->configure(b->info(), _impl->tmp_b.info());
- }
-
- if(!_impl->fused_assembly_path)
- {
- // Build reduction info
- const GEMMLowpReductionKernelInfo reduction_info(_impl->a_to_use->info()->dimension(0), false, 0, false);
-
- // Initialize matrix B reduction kernel only if _impl->a_offset is not equal to 0
- if(_impl->a_offset != 0)
- {
- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
-
- _impl->vector_sum_col.allocator()->init(info_vector_sum_col);
- if(!_impl->reshape_b_only_on_first_run)
- {
- _impl->memory_group.manage(&_impl->vector_sum_col);
- }
-
- // Configure Matrix B reduction kernel
- _impl->mtx_b_reduction_kernel = std::make_unique<NEGEMMLowpMatrixBReductionKernel>();
- _impl->mtx_b_reduction_kernel->configure(b, &_impl->vector_sum_col, reduction_info);
- }
-
- // Initialize Matrix A reduction kernel only if _impl->b_offset is not equal to 0
- if(_impl->b_offset != 0)
- {
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*_impl->a_to_use->info()), 1, DataType::S32);
-
- _impl->vector_sum_row.allocator()->init(info_vector_sum_row);
- _impl->memory_group.manage(&_impl->vector_sum_row);
-
- // Configure matrix A reduction kernel
- _impl->mtx_a_reduction_kernel = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _impl->mtx_a_reduction_kernel->configure(_impl->a_to_use, &_impl->vector_sum_row, reduction_info);
- }
-
- if(_impl->fuse_output_stage)
- {
- // Configure matrix multiply kernel
- if(!_impl->assembly_path)
- {
- _impl->mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- _impl->mm_kernel->configure(matrix_a, matrix_b, &_impl->mm_result_s32);
- }
-
- _impl->offset_contribution_output_stage_kernel = std::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
- _impl->offset_contribution_output_stage_kernel->configure(&_impl->mm_result_s32,
- _impl->a_offset == 0 ? nullptr : &_impl->vector_sum_col,
- _impl->b_offset == 0 ? nullptr : &_impl->vector_sum_row, c,
- _impl->flip_signedness ? &_impl->signed_output : output,
- a->info()->dimension(0),
- _impl->a_offset, _impl->b_offset, info.gemmlowp_output_stage());
-
- if(_impl->flip_signedness)
- {
- _impl->convert_from_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>();
- _impl->convert_from_signed_asymm->configure(&_impl->signed_output, output);
- }
- }
- else
- {
- // Configure matrix multiply kernel
- if(!_impl->assembly_path)
- {
- _impl->mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- _impl->mm_kernel->configure(matrix_a, matrix_b, output);
- }
- // Configure offset contribution kernel
- _impl->offset_contribution_kernel = std::make_unique<NEGEMMLowpOffsetContributionKernel>();
- _impl->offset_contribution_kernel->configure(output, _impl->a_offset == 0 ? nullptr : &_impl->vector_sum_col, _impl->b_offset == 0 ? nullptr : &_impl->vector_sum_row,
- _impl->a_to_use->info()->dimension(0),
- _impl->a_offset, _impl->b_offset);
- }
- }
- // Configure activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- _impl->run_activation = activation.enabled() && (!_impl->assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
- if(_impl->run_activation)
- {
- _impl->activation_func = std::make_unique<NEActivationLayer>();
- _impl->activation_func->configure(output, nullptr, activation);
- }
-
- // Allocate tensors
- if(!_impl->assembly_path && !_impl->run_vector_matrix_multiplication)
- {
- _impl->tmp_a.allocator()->allocate();
- if(!_impl->reshape_b_only_on_first_run)
- {
- _impl->tmp_b.allocator()->allocate();
- }
- }
-
- if(!_impl->fused_assembly_path)
- {
- if(_impl->a_offset != 0 && !_impl->reshape_b_only_on_first_run)
- {
- _impl->vector_sum_col.allocator()->allocate();
- }
-
- if(_impl->b_offset != 0)
- {
- _impl->vector_sum_row.allocator()->allocate();
- }
- }
-
- if(_impl->fuse_output_stage)
- {
- _impl->mm_result_s32.allocator()->allocate();
- }
-
- if(_impl->flip_signedness)
- {
- _impl->signed_a.allocator()->allocate();
- _impl->signed_output.allocator()->allocate();
- }
+ _impl->b = b;
+ _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+ _impl->op->configure(a->info(), b->info(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info);
+ _impl->run_pack =
+ {
+ { TensorType::ACL_SRC_0, a },
+ { TensorType::ACL_SRC_1, b },
+ { TensorType::ACL_SRC_2, c },
+ { TensorType::ACL_DST, output }
+ };
+ _impl->prep_pack =
+ {
+ { TensorType::ACL_SRC_1, b },
+ { TensorType::ACL_SRC_2, c }
+ };
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- GEMMInfo info = gemm_info;
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- const ITensorInfo *a_to_use = a;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo mm_result_s32_info{};
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if(fuse_output_stage)
- {
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
- }
-
- // Convert QASYMM8->QASYMM8_SIGNED
- TensorInfo signed_a{};
- TensorInfo signed_output{};
- bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
- if(flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
-
- signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
- a_to_use = &signed_a;
- a_offset = signed_a.quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
- signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a_info = &signed_a;
- }
-
- // Initialize assembly kernel meta-data
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(info);
-
- // Check if we need to run the optimized assembly kernel
- bool run_optimised = false;
- bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
- run_optimised_requantized = run_optimised;
- }
- else
- {
- run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
- }
-
- if(run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(info.depth_output_gemm3d() != 0)
- {
- if(info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
-
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if(!run_vector_matrix_multiplication)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
-
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
- }
- }
-
- if(!run_optimised_requantized)
- {
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
-
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
- }
-
- if(fuse_output_stage)
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- flip_signedness ? &signed_output : output,
- a_offset, b_offset,
- info.gemmlowp_output_stage()));
- }
- else
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
- }
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
- }
- }
-
- // Validate activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
- }
-
- return Status{};
+ return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info);
}
void NEGEMMLowpMatrixMultiplyCore::run()
{
prepare();
-
MemoryGroupResourceScope scope_mg(_impl->memory_group);
-
- // Convert QASYMM8->QASYMM8_SIGNED
- if(_impl->flip_signedness)
- {
- NEScheduler::get().schedule(_impl->convert_to_signed_asymm.get(), Window::DimY);
- }
-
- // Run GEMM
- if(_impl->asm_glue->is_configured())
- {
- _impl->asm_glue->run(_impl->asm_glue_run_pack);
- }
- else
- {
- if(!_impl->run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- ITensorPack interleave_pack{ { ACL_SRC, _impl->a_to_use }, { ACL_DST, &_impl->tmp_a } };
- NEScheduler::get().schedule_op(_impl->mtx_a_reshape_kernel.get(), Window::DimY, _impl->mtx_a_reshape_kernel->window(), interleave_pack);
-
- if(!_impl->reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- ITensorPack reshape_b_pack{ { ACL_SRC, _impl->original_b }, { ACL_DST, &_impl->tmp_b } };
- NEScheduler::get().schedule_op(_impl->mtx_b_reshape_kernel.get(), Window::DimY, _impl->mtx_b_reshape_kernel->window(), reshape_b_pack);
- }
- }
- NEScheduler::get().schedule(_impl->mm_kernel.get(), Window::DimY);
- }
-
- if(!_impl->fused_assembly_path)
- {
- // Run matrix A reduction kernel only if _impl->b_offset is not equal to 0
- if(_impl->b_offset != 0)
- {
- NEScheduler::get().schedule(_impl->mtx_a_reduction_kernel.get(), Window::DimX);
- }
-
- // Run matrix B reduction kernel only if _impl->a_offset is not equal to 0
- if(_impl->a_offset != 0 && !_impl->reshape_b_only_on_first_run)
- {
- NEScheduler::get().schedule(_impl->mtx_b_reduction_kernel.get(), Window::DimX);
- }
-
- if(_impl->fuse_output_stage)
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(_impl->offset_contribution_output_stage_kernel.get(), Window::DimY);
- }
- else
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(_impl->offset_contribution_kernel.get(), Window::DimY);
- }
- }
-
- // Convert QASYMM8_SIGNED->QASYMM8
- if(!_impl->fused_assembly_path && _impl->fuse_output_stage && _impl->flip_signedness)
- {
- NEScheduler::get().schedule(_impl->convert_from_signed_asymm.get(), Window::DimY);
- }
-
- // Run fused activation unless already run in the fused assembly
- if(_impl->run_activation)
- {
- _impl->activation_func->run();
- }
+ _impl->op->run(_impl->run_pack);
}
void NEGEMMLowpMatrixMultiplyCore::prepare()
{
if(!_impl->is_prepared)
{
- // Run assembly reshape
- if(_impl->asm_glue->is_configured())
- {
- _impl->asm_glue->prepare(_impl->asm_glue_prep_pack);
+ _impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
+ _impl->aux_mem_req.end(),
+ [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
- {
- _impl->original_b->mark_as_unused();
- }
- else
- {
- _impl->asm_glue_run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b);
- }
- }
- // Run non-assembly reshape
- else if(_impl->reshape_b_only_on_first_run && !_impl->run_vector_matrix_multiplication && !_impl->asm_glue->is_configured())
+ if(has_reshape != std::end(_impl->aux_mem_req))
{
- // Run reshape kernel and mark original weights tensor as unused
- _impl->tmp_b.allocator()->allocate();
- ITensorPack reshape_b_pack{ { ACL_SRC, _impl->original_b }, { ACL_DST, &_impl->tmp_b } };
- NEScheduler::get().schedule_op(_impl->mtx_b_reshape_kernel.get(), Window::DimY, _impl->mtx_b_reshape_kernel->window(), reshape_b_pack);
+ _impl->b->mark_as_unused();
}
- // Run matrix B reduction kernel only if _impl->a_offset is not equal to 0
- if(!_impl->fused_assembly_path && _impl->a_offset != 0 && _impl->reshape_b_only_on_first_run)
+ // Release temporary tensors that are only used in prepare stage
+ for(auto &ws : _impl->workspace_tensors)
{
- _impl->vector_sum_col.allocator()->allocate();
- NEScheduler::get().schedule(_impl->mtx_b_reduction_kernel.get(), Window::DimX);
+ const int slot = ws.first;
+ for(auto &m : _impl->aux_mem_req)
+ {
+ if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
+ {
+ auto tensor = ws.second.get();
+ tensor->allocator()->free();
+ break;
+ }
+ }
}
-
_impl->is_prepared = true;
}
}
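With this refactor NEGEMMLowpMatrixMultiplyCore becomes a thin wrapper over the new cpu::CpuGemmLowpMatrixMultiplyCore operator, while its public interface stays the same. A minimal caller still looks like the sketch below; shapes, data types and quantization parameters are illustrative only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_gemmlowp_example()
{
    Tensor a, b, dst;
    // a: [K, M] = [32, 16], b: [N, K] = [24, 32], dst: [N, M] = [24, 16]
    a.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(24U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
    dst.allocator()->init(TensorInfo(TensorShape(24U, 16U), 1, DataType::S32));

    NEGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, nullptr, &dst, GEMMInfo()); // no bias, no fused output stage -> S32 result

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized data ...
    gemmlowp.run(); // run() calls prepare() internally on first use, as shown in the hunk above
}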
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index f3a3d23256..946791a104 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Utils.h"
@@ -30,12 +31,8 @@
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
@@ -223,29 +220,29 @@ void NEQLSTMLayer::configure(const ITensor *input,
_input_to_input_weights = lstm_params.input_to_input_weights();
_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
- _input_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
}
- _input_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-
- _recurrent_to_cell_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+
+ _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
if(_has_projection)
{
- _projection_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
if(_projection_bias != nullptr)
{
_projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
@@ -658,21 +655,26 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
if(!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+ -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+ -qoutput_state_in.offset,
+ true)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+ -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
+ true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+ -qoutput_state_in.offset, true)));
if(lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
- lstm_params.hidden_state_zero(),
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
+ lstm_params.hidden_state_zero(),
+ true)));
if(lstm_params.projection_bias() != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
@@ -1107,8 +1109,20 @@ void NEQLSTMLayer::prepare()
{
_input_to_input_eff_bias.allocator()->allocate();
_recurrent_to_input_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY);
+
+ ITensorPack packII =
+ {
+ { TensorType::ACL_SRC, _input_to_input_weights },
+ { TensorType::ACL_DST, &_input_to_input_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII);
+
+ ITensorPack packRI =
+ {
+ { TensorType::ACL_SRC, _recurrent_to_input_weights },
+ { TensorType::ACL_DST, &_recurrent_to_input_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI);
_input_to_input_weights_transposed.allocator()->allocate();
_recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1123,17 +1137,58 @@ void NEQLSTMLayer::prepare()
_recurrent_to_cell_eff_bias.allocator()->allocate();
_input_to_output_eff_bias.allocator()->allocate();
_recurrent_to_output_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY);
+
+ ITensorPack packIF =
+ {
+ { TensorType::ACL_SRC, _input_to_forget_weights },
+ { TensorType::ACL_DST, &_input_to_forget_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF);
+
+ ITensorPack packRF =
+ {
+ { TensorType::ACL_SRC, _recurrent_to_forget_weights },
+ { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF);
+
+ ITensorPack packIC =
+ {
+ { TensorType::ACL_SRC, _input_to_cell_weights },
+ { TensorType::ACL_DST, &_input_to_cell_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), packIC);
+
+ ITensorPack packRC =
+ {
+ { TensorType::ACL_SRC, _recurrent_to_cell_weights },
+ { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC);
+
+ ITensorPack packIO =
+ {
+ { TensorType::ACL_SRC, _input_to_output_weights },
+ { TensorType::ACL_DST, &_input_to_output_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO);
+
+ ITensorPack packRO =
+ {
+ { TensorType::ACL_SRC, _recurrent_to_output_weights },
+ { TensorType::ACL_DST, &_recurrent_to_output_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO);
if(_has_projection)
{
_projection_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, _projection_weights },
+ { TensorType::ACL_DST, &_projection_eff_bias }
+ };
+ NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack);
if(_projection_bias != nullptr)
{
_projection_bias_add.run();
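Every reduction in the prepare() hunks above is now dispatched the same way: build a two-entry ITensorPack and pass it to schedule_op() together with the kernel's own window. A hypothetical helper, not part of this patch, shows the pattern once (it assumes the kernel has already been configured and the destination tensor allocated):

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"

using namespace arm_compute;

// Sketch only: wraps the repeated "weights -> effective bias" reduction dispatch.
static void run_weight_reduction(cpu::kernels::CpuGemmLowpMatrixAReductionKernel *kernel,
                                 const ITensor *weights, ITensor *eff_bias)
{
    ITensorPack pack =
    {
        { TensorType::ACL_SRC, weights },
        { TensorType::ACL_DST, eff_bias }
    };
    NEScheduler::get().schedule_op(kernel, Window::DimY, kernel->window(), pack);
}

// e.g. run_weight_reduction(_input_to_input_reduction.get(), _input_to_input_weights, &_input_to_input_eff_bias);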
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..651ce436a0
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+
+#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
+#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.output_stage = info.gemmlowp_output_stage();
+
+ return asm_info;
+}
+} // namespace
+
+CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
+ : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
+ _mm_kernel(),
+ _mtx_a_reshape_kernel(),
+ _mtx_b_reshape_kernel(),
+ _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(),
+ _offset_contribution_kernel(),
+ _offset_contribution_output_stage_kernel(),
+ _activation_func(),
+ _convert_to_signed_asymm(),
+ _convert_from_signed_asymm(),
+ _vector_sum_col(),
+ _vector_sum_row(),
+ _tmp_a(),
+ _tmp_b(),
+ _mm_result_s32(),
+ _signed_a(),
+ _signed_output(),
+ _a_offset(0),
+ _b_offset(0),
+ _run_vector_matrix_multiplication(false),
+ _assembly_path(false),
+ _fused_assembly_path(false),
+ _reshape_b_only_on_first_run(false),
+ _is_prepared(false),
+ _fuse_output_stage(false),
+ _run_activation(false),
+ _flip_signedness(false),
+ _gemm_info(),
+ _aux_mem(Count)
+{
+}
+CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
+
+void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
+
+ const ITensorInfo *matrix_a = a;
+ const ITensorInfo *matrix_b = b;
+ GEMMInfo info = gemm_info;
+
+ // Set internal variables
+ _a_offset = a->quantization_info().uniform().offset;
+ _b_offset = b->quantization_info().uniform().offset;
+ _run_vector_matrix_multiplication = a->dimension(1) < 2;
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+ _is_prepared = false;
+ _fused_assembly_path = false;
+ _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
+ _gemm_info = gemm_info;
+
+ _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+
+ const ITensorInfo *a_to_use = a;
+
+ // Convert to QASYMM8 -> QASYMM8_SIGNED and back
+ if(_flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
+ _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
+ a_to_use = &_signed_a;
+ _a_offset = _signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+ _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a = &_signed_a;
+ }
+
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+ _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
+ }
+
+ // Initialize assembly kernel meta-data
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+#ifdef __aarch64__
+ switch(a->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::U8:
+ case DataType::S8:
+ {
+ if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ auto c_info_to_use = c == nullptr ? nullptr : c;
+ _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
+ _fused_assembly_path = _asm_glue->is_configured();
+ }
+ else
+ {
+ auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
+ _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
+ }
+ _assembly_path = _asm_glue->is_configured();
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
+ }
+#endif /* __aarch64__ */
+ if(!(_assembly_path || _run_vector_matrix_multiplication))
+ {
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+        // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+ _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
+
+ // Configure interleave kernel
+ _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
+ _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
+
+ // Configure transpose kernel
+ _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
+ _mtx_b_reshape_kernel->configure(b, &_tmp_b);
+ }
+
+ if(!_fused_assembly_path)
+ {
+ // Build reduction info
+ const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
+ _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
+ }
+
+ if(_fuse_output_stage)
+ {
+ // Configure matrix multiply kernel
+ if(!_assembly_path)
+ {
+ _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
+ }
+
+ _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : dst,
+ a->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+ if(_flip_signedness)
+ {
+ _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
+ _convert_from_signed_asymm->configure(&_signed_output, dst);
+ }
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ if(!_assembly_path)
+ {
+ _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, dst);
+ }
+ // Configure offset contribution kernel
+ _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
+ _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
+ _a_offset, _b_offset);
+ }
+ }
+ // Configure activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+ if(_run_activation)
+ {
+ _activation_func = std::make_unique<CpuActivation>();
+ _activation_func->configure(dst, nullptr, activation);
+ }
+
+ if(_assembly_path)
+ {
+ auto asm_mem_req = _asm_glue->workspace();
+ _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
+ _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
+ }
+
+ // Request memory for LHS and RHS reshape matrix
+    _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol),
+                                        (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+                                        _vector_sum_col.total_size());
+ _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+ _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+}
+
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in CpuGemmLowpMatrixMultiplyCore for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ GEMMInfo info = gemm_info;
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ const ITensorInfo *a_to_use = a;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo mm_result_s32_info{};
+
+ int32_t a_offset = a->quantization_info().uniform().offset;
+ int32_t b_offset = b->quantization_info().uniform().offset;
+
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ if(fuse_output_stage)
+ {
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ }
+
+ // Convert QASYMM8->QASYMM8_SIGNED
+ TensorInfo signed_a{};
+ TensorInfo signed_output{};
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if(flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+ a_to_use = &signed_a;
+ a_offset = signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a_info = &signed_a;
+ }
+
+ // Initialize assembly kernel meta-data
+ const AsmGemmInfo asm_info = init_assembly_metadata(info);
+
+ // Check if we need to run the optimized assembly kernel
+ bool run_optimised = false;
+ bool run_optimised_requantized = false;
+ if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
+ run_optimised_requantized = run_optimised;
+ }
+ else
+ {
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+ }
+
+ if(run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if(info.depth_output_gemm3d() != 0)
+ {
+ if(info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
+ }
+ else
+ {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
+
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ if(!run_vector_matrix_multiplication)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+            // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+ }
+ }
+
+ if(!run_optimised_requantized)
+ {
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
+
+ const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
+        // Validate matrix B reduction kernel only if a_offset is not equal to 0
+ if(a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+            // Validate Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+ }
+
+        // Validate Matrix A reduction kernel only if b_offset is not equal to 0
+ if(b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+            // Validate matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+ }
+
+ if(fuse_output_stage)
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ flip_signedness ? &signed_output : output,
+ a_offset, b_offset,
+ info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
+ }
+ }
+
+ // Validate activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if(activation.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
+ }
+
+ return Status{};
+}
+
+void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ auto a_to_use = a;
+ auto matrix_a = a;
+ auto matrix_b = b;
+
+ CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
+ CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
+ CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
+ CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
+ CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
+ CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
+ CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
+
+ // Convert QASYMM8->QASYMM8_SIGNED
+ if(_flip_signedness)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, a },
+ { TensorType::ACL_DST, signed_a.get() }
+ };
+ NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
+ a_to_use = signed_a.get();
+ }
+
+ // Run GEMM
+ if(_asm_glue->is_configured())
+ {
+ ITensorPack asm_glue_tensors = tensors;
+ auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
+ if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
+ asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
+ }
+ else
+ {
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+ asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
+ }
+ _asm_glue->run(asm_glue_tensors);
+ }
+ else
+ {
+ if(!_run_vector_matrix_multiplication)
+ {
+ matrix_a = tmp_a.get();
+ matrix_b = tmp_b.get();
+ // Run interleave kernel
+ ITensorPack pack_a =
+ {
+ { TensorType::ACL_SRC, a_to_use },
+ { TensorType::ACL_DST, tmp_a.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ ITensorPack pack_b =
+ {
+ { TensorType::ACL_SRC, b },
+ { TensorType::ACL_DST, tmp_b.get() }
+ };
+ // Run transpose kernel
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
+ }
+ }
+ ITensorPack pack_mm =
+ {
+ { TensorType::ACL_SRC_0, matrix_a },
+ { TensorType::ACL_SRC_1, matrix_b }
+ };
+ if(_fuse_output_stage)
+ {
+ pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
+ }
+ else
+ {
+ pack_mm.add_tensor(TensorType::ACL_DST, dst);
+ }
+ NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
+ }
+
+ if(!_fused_assembly_path)
+ {
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, a_to_use },
+ { TensorType::ACL_DST, vector_sum_row.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, b },
+ { TensorType::ACL_DST, vector_sum_col.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ }
+
+ if(_fuse_output_stage)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
+ pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
+ pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
+ pack.add_tensor(TensorType::ACL_SRC_3, c);
+ pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
+
+ // Run offset contribution kernel
+ NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
+ }
+ else
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
+ pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
+ pack.add_tensor(TensorType::ACL_DST, dst);
+
+ // Run offset contribution kernel
+ NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
+ }
+ }
+
+ // Convert QASYMM8_SIGNED->QASYMM8
+ if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, signed_output.get() },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
+ }
+
+ // Run fused activation unless already run in the fused assembly
+ if(_run_activation)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, dst },
+ { TensorType::ACL_DST, dst }
+ };
+ _activation_func->run(pack);
+ }
+}
+
+void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ // Run assembly reshape
+ if(_asm_glue->is_configured())
+ {
+ _asm_glue->prepare(tensors);
+
+ auto has_reshape = std::find_if(_aux_mem.begin(),
+ _aux_mem.end(),
+ [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+
+ if(has_reshape != std::end(_aux_mem))
+ {
+ original_b->mark_as_unused();
+ }
+ }
+ // Run non-assembly reshape
+ else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ {
+ // Run reshape kernel and mark original weights tensor as unused
+ ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
+ CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, original_b },
+ { TensorType::ACL_DST, tmp_b.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ ITensor *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
+ CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, original_b },
+ { TensorType::ACL_DST, vector_sum_col.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ }
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
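A minimal usage sketch of the operator defined above, assuming the arm_compute namespace, the runtime Tensor/ITensorPack types and the test-side manage_workspace<> helper from src/core/helpers/MemoryHelpers.h; the shapes, quantization values and the function name run_lowp_gemm_example are illustrative only and not part of the committed sources. It mirrors the MemoryInjection test added further down in this patch: tensor infos at configure time, auxiliary memory obtained from workspace(), concrete tensors injected at run time.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/MemoryGroup.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/helpers/MemoryHelpers.h"
    #include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    void run_lowp_gemm_example()
    {
        // Configuration only needs tensor metadata, no backing memory.
        cpu::CpuGemmLowpMatrixMultiplyCore gemm{};
        TensorInfo a_info(TensorShape(32U, 72U), 1, DataType::QASYMM8, QuantizationInfo(1.0f / 255, -9));
        TensorInfo b_info(TensorShape(17U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.0f / 255, 1));
        TensorInfo dst_info(TensorShape(17U, 72U), 1, DataType::S32);
        gemm.configure(&a_info, &b_info, nullptr, &dst_info, GEMMInfo{});

        // Concrete tensors are only needed from this point on.
        Tensor a, b, dst;
        a.allocator()->init(a_info);
        b.allocator()->init(b_info);
        dst.allocator()->init(dst_info);
        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack run_pack  = { { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_DST, &dst } };
        ITensorPack prep_pack = { { TensorType::ACL_SRC_1, &b } };

        // Auxiliary buffers (reshaped B, reduction vectors, ...) are owned by the caller.
        MemoryGroup mg{};
        auto ws = manage_workspace<Tensor>(gemm.workspace(), mg, run_pack, prep_pack);

        gemm.prepare(prep_pack); // one-off work, e.g. B reshape/reduction when reshape_b_only_on_first_run
        gemm.run(run_pack);      // may be called repeatedly, with different tensors injected each time
    }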
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
new file mode 100644
index 0000000000..1d0e470559
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/common/Macros.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmInterleave4x4Kernel;
+class CpuGemmLowpMatrixMultiplyKernel;
+class CpuGemmLowpOffsetContributionKernel;
+class CpuGemmLowpOffsetContributionOutputStageKernel;
+class CpuGemmLowpMatrixAReductionKernel;
+class CpuGemmLowpMatrixBReductionKernel;
+class CpuGemmTranspose1xWKernel;
+class CpuConvertQuantizedSignednessKernel;
+} // namespace kernels
+class CpuGemmAssemblyDispatch;
+class CpuActivation;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
+ *
+ * -# @ref kernels::CpuGemmInterleave4x4Kernel
+ * -# @ref kernels::CpuGemmTranspose1xWKernel
+ * -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel
+ * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ * -# @ref CpuActivation
+ *
+ * otherwise, if the DOT product instruction is available, the matrix multiplication is handled by @ref CpuGemmAssemblyDispatch and only the following kernel is called:
+ *
+ * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ *
+*/
+class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmLowpMatrixMultiplyCore();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore);
+ /** Destructor */
+ ~CpuGemmLowpMatrixMultiplyCore();
+    /** Initialise the operator's inputs and output
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QASYMM8 |S32 |S32 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8 |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ *  This function performs the following computations:
+ *
+ *  -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+ *  -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ *
+ * @note The @p dst type is S32 if @p gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
+ *
+ * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+ * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
+ * @param[out] dst       Output tensor info. Data type supported: S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ Pretranspose,
+ VectorSumCol,
+ VectorSumRow,
+ TmpA,
+ TmpB,
+ MMResultS32,
+ SignedA,
+ SignedOutput,
+ Count
+ };
+
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel> _mm_kernel;
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _mtx_a_reshape_kernel;
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _mtx_b_reshape_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_to_signed_asymm;
+ std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_from_signed_asymm;
+
+ TensorInfo _vector_sum_col;
+ TensorInfo _vector_sum_row;
+ TensorInfo _tmp_a;
+ TensorInfo _tmp_b;
+ TensorInfo _mm_result_s32;
+ TensorInfo _signed_a;
+ TensorInfo _signed_output;
+ int32_t _a_offset;
+ int32_t _b_offset;
+
+ bool _run_vector_matrix_multiplication;
+ bool _assembly_path;
+ bool _fused_assembly_path;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _fuse_output_stage;
+ bool _run_activation;
+ bool _flip_signedness;
+ GEMMInfo _gemm_info;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
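As a reading aid for the kernel list above, the following naive reference (illustrative only, not the library implementation; the function name gemmlowp_reference is an assumption) spells out the computation the class documentation describes, accumulated in int32 before any output stage. Expanding the product explains the kernel split: sum_k A(i,k)*B(k,j) is the matrix multiply, a_offset * sum_k B(k,j) needs the column sums of B (vector_sum_col), b_offset * sum_k A(i,k) needs the row sums of A (vector_sum_row), and K * a_offset * b_offset is a constant folded in by the offset contribution kernel, which is also why each reduction is skipped when the corresponding offset is zero.

    #include <cstdint>
    #include <vector>

    // dst(i,j) = sum_k (A(i,k) + a_offset) * (B(k,j) + b_offset)
    //          = sum_k A(i,k)*B(k,j)         -> matrix multiply kernel / assembly path
    //          + a_offset * sum_k B(k,j)     -> vector_sum_col (CpuGemmLowpMatrixBReductionKernel)
    //          + b_offset * sum_k A(i,k)     -> vector_sum_row (CpuGemmLowpMatrixAReductionKernel)
    //          + K * a_offset * b_offset     -> constant handled by the offset contribution kernel
    std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                            int m, int n, int k, int32_t a_offset, int32_t b_offset)
    {
        std::vector<int32_t> dst(static_cast<size_t>(m) * n, 0);
        for(int i = 0; i < m; ++i)
        {
            for(int j = 0; j < n; ++j)
            {
                int32_t acc = 0;
                for(int kk = 0; kk < k; ++kk)
                {
                    acc += (static_cast<int32_t>(a[i * k + kk]) + a_offset) * (static_cast<int32_t>(b[kk * n + j]) + b_offset);
                }
                dst[i * n + j] = acc;
            }
        }
        return dst;
    }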
diff --git a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h b/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
index 152e3c6c04..1cf717cf6f 100644
--- a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
+++ b/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
@@ -41,6 +41,10 @@ public:
CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
: _tensor()
{
+ if(info.total_size() == 0)
+ {
+ return;
+ }
_tensor.allocator()->soft_init(info);
ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
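The early return added above covers auxiliary tensor infos that a given configuration never populates; CpuGemmLowpMatrixMultiplyCore::run() earlier in this patch constructs handlers for infos that can be left empty (for example _mm_result_s32 when no output stage is fused, or _signed_a when signedness is not flipped), so the same zero-size property presumably matters for both handler variants. A small self-contained illustration of that state, assuming only the public TensorInfo API:

    #include <cassert>
    #include "arm_compute/core/TensorInfo.h"

    int main()
    {
        // An aux info that a configuration leaves untouched stays default-constructed,
        // so it reports a total size of zero and the handler must skip it entirely.
        arm_compute::TensorInfo unused_aux{};
        assert(unused_aux.total_size() == 0);
        return 0;
    }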
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 5d87330982..1941586d5c 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -26,6 +26,8 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
#include "tests/NEON/Accessor.h"
#include "tests/NEON/Helper.h"
#include "tests/PaddingCalculator.h"
@@ -109,6 +111,105 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
// clang-format on
// *INDENT-ON*
+/** Test case for memory injection in @ref cpu::CpuGemmLowpMatrixMultiplyCore.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+ auto gemm = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+ auto a_info = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+ auto b_info = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+ auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+ a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+ b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+ const auto gemm_info = GEMMInfo{};
+ gemm->configure(&a_info, &b_info, nullptr, &dst_info, gemm_info);
+
+    // these tensors are allocated once here and reused; run_conv below creates a fresh dst on every call
+ auto a = create_tensor<Tensor>(a_info);
+ auto b = create_tensor<Tensor>(b_info);
+ auto dst = create_tensor<Tensor>(dst_info);
+ a.allocator()->allocate();
+ b.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ ITensorPack run_pack =
+ {
+ { TensorType::ACL_SRC_0, &a },
+ { TensorType::ACL_SRC_1, &b },
+ { TensorType::ACL_DST, &dst }
+ };
+ ITensorPack prep_pack =
+ {
+ { TensorType::ACL_SRC_1, &b },
+ };
+
+ auto mg = MemoryGroup{};
+ auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
+
+ auto run_conv = [&]() -> Tensor
+ {
+ auto dst = create_tensor<Tensor>(dst_info);
+ dst.allocator()->allocate();
+ run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+ library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+ library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+ // This operator is configured once and captured by this lambda.
+ gemm->prepare(prep_pack);
+ gemm->run(run_pack);
+ return dst;
+ };
+ auto result_0 = run_conv();
+ auto result_1 = run_conv();
+ for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+ {
+ ARM_COMPUTE_EXPECT(((uint8_t *)result_0.buffer())[i] == ((uint8_t *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+ }
+}
+
+/** Test case for @ref NEGEMMLowpMatrixMultiplyCore with the legacy run-time API.
+ *
+ * Make sure @ref NEGEMMLowpMatrixMultiplyCore still works when memory is set up at configure time through the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+ auto gemm = std::make_unique<NEGEMMLowpMatrixMultiplyCore>();
+ auto a_info = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+ auto b_info = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+ auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+ a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+ b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+ const auto gemm_info = GEMMInfo{};
+ auto run_conv = [&]()
+ {
+ auto a = create_tensor<Tensor>(a_info);
+ auto b = create_tensor<Tensor>(b_info);
+ auto dst = create_tensor<Tensor>(dst_info);
+ gemm->configure(&a, &b, nullptr, &dst, gemm_info);
+ a.allocator()->allocate();
+ b.allocator()->allocate();
+ dst.allocator()->allocate();
+ library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+ library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+ gemm->run();
+ return dst;
+ };
+ auto result_0 = run_conv();
+ auto result_1 = run_conv();
+ for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+ {
+ ARM_COMPUTE_EXPECT(((uint8_t *)result_0.buffer())[i] == ((uint8_t *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+ }
+}
+
FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
{
// Validate output