aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/Allocator.cpp2
-rw-r--r--src/runtime/BlobLifetimeManager.cpp33
-rw-r--r--src/runtime/BlobMemoryPool.cpp6
-rw-r--r--src/runtime/CL/CLBufferAllocator.cpp3
-rw-r--r--src/runtime/CL/CLGEMMHeuristicsHandle.cpp3
-rw-r--r--src/runtime/CL/CLHelpers.cpp41
-rw-r--r--src/runtime/CL/CLMemory.cpp12
-rw-r--r--src/runtime/CL/CLMemoryRegion.cpp51
-rw-r--r--src/runtime/CL/CLOperator.cpp5
-rw-r--r--src/runtime/CL/CLRuntimeContext.cpp6
-rw-r--r--src/runtime/CL/CLScheduler.cpp85
-rw-r--r--src/runtime/CL/CLSubTensor.cpp18
-rw-r--r--src/runtime/CL/CLTensorAllocator.cpp40
-rw-r--r--src/runtime/CL/CLTuner.cpp154
-rw-r--r--src/runtime/CL/ICLSimpleFunction.cpp5
-rw-r--r--src/runtime/CL/Utils.cpp16
-rw-r--r--src/runtime/CL/functions/CLActivationLayer.cpp27
-rw-r--r--src/runtime/CL/functions/CLArgMinMaxLayer.cpp149
-rw-r--r--src/runtime/CL/functions/CLBatchNormalizationLayer.cpp38
-rw-r--r--src/runtime/CL/functions/CLBatchToSpaceLayer.cpp41
-rw-r--r--src/runtime/CL/functions/CLBitwiseAnd.cpp11
-rw-r--r--src/runtime/CL/functions/CLBitwiseNot.cpp6
-rw-r--r--src/runtime/CL/functions/CLBitwiseOr.cpp11
-rw-r--r--src/runtime/CL/functions/CLBitwiseXor.cpp11
-rw-r--r--src/runtime/CL/functions/CLBoundingBoxTransform.cpp21
-rw-r--r--src/runtime/CL/functions/CLCast.cpp23
-rw-r--r--src/runtime/CL/functions/CLChannelShuffleLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLComparison.cpp38
-rw-r--r--src/runtime/CL/functions/CLConcatenateLayer.cpp29
-rw-r--r--src/runtime/CL/functions/CLConv3D.cpp97
-rw-r--r--src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp35
-rw-r--r--src/runtime/CL/functions/CLConvolutionLayer.cpp288
-rw-r--r--src/runtime/CL/functions/CLCopy.cpp16
-rw-r--r--src/runtime/CL/functions/CLCrop.cpp49
-rw-r--r--src/runtime/CL/functions/CLCropResize.cpp195
-rw-r--r--src/runtime/CL/functions/CLDeconvolutionLayer.cpp129
-rw-r--r--src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp18
-rw-r--r--src/runtime/CL/functions/CLDepthConvertLayer.cpp27
-rw-r--r--src/runtime/CL/functions/CLDepthToSpaceLayer.cpp9
-rw-r--r--src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp559
-rw-r--r--src/runtime/CL/functions/CLDequantizationLayer.cpp18
-rw-r--r--src/runtime/CL/functions/CLDirectConvolutionLayer.cpp49
-rw-r--r--src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp86
-rw-r--r--src/runtime/CL/functions/CLElementwiseOperations.cpp212
-rw-r--r--src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp80
-rw-r--r--src/runtime/CL/functions/CLFFT1D.cpp31
-rw-r--r--src/runtime/CL/functions/CLFFT2D.cpp17
-rw-r--r--src/runtime/CL/functions/CLFFTConvolutionLayer.cpp127
-rw-r--r--src/runtime/CL/functions/CLFill.cpp19
-rw-r--r--src/runtime/CL/functions/CLFillBorder.cpp42
-rw-r--r--src/runtime/CL/functions/CLFlattenLayer.cpp26
-rw-r--r--src/runtime/CL/functions/CLFloor.cpp14
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp483
-rw-r--r--src/runtime/CL/functions/CLFuseBatchNormalization.cpp58
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp73
-rw-r--r--src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp696
-rw-r--r--src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp205
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp685
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp187
-rw-r--r--src/runtime/CL/functions/CLGather.cpp11
-rw-r--r--src/runtime/CL/functions/CLGenerateProposalsLayer.cpp191
-rw-r--r--src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp100
-rw-r--r--src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp35
-rw-r--r--src/runtime/CL/functions/CLL2NormalizeLayer.cpp12
-rw-r--r--src/runtime/CL/functions/CLLSTMLayer.cpp578
-rw-r--r--src/runtime/CL/functions/CLLSTMLayerQuantized.cpp428
-rw-r--r--src/runtime/CL/functions/CLLogicalAnd.cpp29
-rw-r--r--src/runtime/CL/functions/CLLogicalNot.cpp16
-rw-r--r--src/runtime/CL/functions/CLLogicalOr.cpp29
-rw-r--r--src/runtime/CL/functions/CLMatMul.cpp87
-rw-r--r--src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp22
-rw-r--r--src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLNormalizationLayer.cpp21
-rw-r--r--src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp21
-rw-r--r--src/runtime/CL/functions/CLPReluLayer.cpp27
-rw-r--r--src/runtime/CL/functions/CLPadLayer.cpp44
-rw-r--r--src/runtime/CL/functions/CLPermute.cpp21
-rw-r--r--src/runtime/CL/functions/CLPixelWiseMultiplication.cpp84
-rw-r--r--src/runtime/CL/functions/CLPooling3dLayer.cpp77
-rw-r--r--src/runtime/CL/functions/CLPoolingLayer.cpp33
-rw-r--r--src/runtime/CL/functions/CLPriorBoxLayer.cpp37
-rw-r--r--src/runtime/CL/functions/CLQLSTMLayer.cpp986
-rw-r--r--src/runtime/CL/functions/CLQuantizationLayer.cpp12
-rw-r--r--src/runtime/CL/functions/CLRNNLayer.cpp59
-rw-r--r--src/runtime/CL/functions/CLROIAlignLayer.cpp22
-rw-r--r--src/runtime/CL/functions/CLROIPoolingLayer.cpp21
-rw-r--r--src/runtime/CL/functions/CLRange.cpp8
-rw-r--r--src/runtime/CL/functions/CLReduceMean.cpp109
-rw-r--r--src/runtime/CL/functions/CLReductionOperation.cpp64
-rw-r--r--src/runtime/CL/functions/CLRemap.cpp51
-rw-r--r--src/runtime/CL/functions/CLReorgLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLReshapeLayer.cpp16
-rw-r--r--src/runtime/CL/functions/CLReverse.cpp24
-rw-r--r--src/runtime/CL/functions/CLScale.cpp17
-rw-r--r--src/runtime/CL/functions/CLScatter.cpp93
-rw-r--r--src/runtime/CL/functions/CLSelect.cpp11
-rw-r--r--src/runtime/CL/functions/CLSlice.cpp44
-rw-r--r--src/runtime/CL/functions/CLSoftmaxLayer.cpp28
-rw-r--r--src/runtime/CL/functions/CLSpaceToBatchLayer.cpp73
-rw-r--r--src/runtime/CL/functions/CLSpaceToDepthLayer.cpp13
-rw-r--r--src/runtime/CL/functions/CLSplit.cpp3
-rw-r--r--src/runtime/CL/functions/CLStackLayer.cpp24
-rw-r--r--src/runtime/CL/functions/CLStridedSlice.cpp84
-rw-r--r--src/runtime/CL/functions/CLTile.cpp9
-rw-r--r--src/runtime/CL/functions/CLTranspose.cpp14
-rw-r--r--src/runtime/CL/functions/CLUnstack.cpp43
-rw-r--r--src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp76
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp271
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp40
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp304
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h5
-rw-r--r--src/runtime/CL/gemm/CLGEMMKernelSelection.h12
-rw-r--r--src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp84
-rw-r--r--src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h7
-rw-r--r--src/runtime/CL/mlgo/Common.h40
-rw-r--r--src/runtime/CL/mlgo/HeuristicTree.cpp89
-rw-r--r--src/runtime/CL/mlgo/HeuristicTree.h24
-rw-r--r--src/runtime/CL/mlgo/MLGOHeuristics.cpp99
-rw-r--r--src/runtime/CL/mlgo/MLGOHeuristics.h6
-rw-r--r--src/runtime/CL/mlgo/MLGOParser.cpp188
-rw-r--r--src/runtime/CL/mlgo/MLGOParser.h9
-rw-r--r--src/runtime/CL/mlgo/Utils.cpp48
-rw-r--r--src/runtime/CL/mlgo/Utils.h10
-rw-r--r--src/runtime/CL/tuners/CLTuningParametersList.cpp77
-rw-r--r--src/runtime/CPP/CPPScheduler.cpp133
-rw-r--r--src/runtime/CPP/SingleThreadScheduler.cpp17
-rw-r--r--src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp159
-rw-r--r--src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp315
-rw-r--r--src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp418
-rw-r--r--src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp27
-rw-r--r--src/runtime/CPP/functions/CPPPermute.cpp6
-rw-r--r--src/runtime/CPP/functions/CPPTopKV.cpp11
-rw-r--r--src/runtime/CPP/functions/CPPUpsample.cpp6
-rw-r--r--src/runtime/IScheduler.cpp99
-rw-r--r--src/runtime/ISimpleLifetimeManager.cpp27
-rw-r--r--src/runtime/IWeightsManager.cpp70
-rw-r--r--src/runtime/Memory.cpp9
-rw-r--r--src/runtime/MemoryManagerOnDemand.cpp5
-rw-r--r--src/runtime/NEON/INEOperator.cpp7
-rw-r--r--src/runtime/NEON/INESimpleFunction.cpp4
-rw-r--r--src/runtime/NEON/INESimpleFunctionNoBorder.cpp5
-rw-r--r--src/runtime/NEON/functions/NEActivationLayer.cpp19
-rw-r--r--src/runtime/NEON/functions/NEAddMulAdd.cpp89
-rw-r--r--src/runtime/NEON/functions/NEArgMinMaxLayer.cpp55
-rw-r--r--src/runtime/NEON/functions/NEArithmeticAddition.cpp28
-rw-r--r--src/runtime/NEON/functions/NEArithmeticSubtraction.cpp28
-rw-r--r--src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp29
-rw-r--r--src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp21
-rw-r--r--src/runtime/NEON/functions/NEBitwiseAnd.cpp4
-rw-r--r--src/runtime/NEON/functions/NEBitwiseNot.cpp4
-rw-r--r--src/runtime/NEON/functions/NEBitwiseOr.cpp4
-rw-r--r--src/runtime/NEON/functions/NEBitwiseXor.cpp4
-rw-r--r--src/runtime/NEON/functions/NEBoundingBoxTransform.cpp15
-rw-r--r--src/runtime/NEON/functions/NECast.cpp23
-rw-r--r--src/runtime/NEON/functions/NEChannelShuffleLayer.cpp5
-rw-r--r--src/runtime/NEON/functions/NEConcatenateLayer.cpp30
-rw-r--r--src/runtime/NEON/functions/NEConv3D.cpp87
-rw-r--r--src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp27
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp281
-rw-r--r--src/runtime/NEON/functions/NECopy.cpp14
-rw-r--r--src/runtime/NEON/functions/NECropResize.cpp56
-rw-r--r--src/runtime/NEON/functions/NEDeconvolutionLayer.cpp197
-rw-r--r--src/runtime/NEON/functions/NEDepthConvertLayer.cpp19
-rw-r--r--src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp21
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp205
-rw-r--r--src/runtime/NEON/functions/NEDequantizationLayer.cpp12
-rw-r--r--src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp62
-rw-r--r--src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp29
-rw-r--r--src/runtime/NEON/functions/NEElementwiseOperations.cpp152
-rw-r--r--src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp17
-rw-r--r--src/runtime/NEON/functions/NEFFT1D.cpp33
-rw-r--r--src/runtime/NEON/functions/NEFFT2D.cpp15
-rw-r--r--src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp107
-rw-r--r--src/runtime/NEON/functions/NEFill.cpp12
-rw-r--r--src/runtime/NEON/functions/NEFillBorder.cpp13
-rw-r--r--src/runtime/NEON/functions/NEFlattenLayer.cpp24
-rw-r--r--src/runtime/NEON/functions/NEFloor.cpp14
-rw-r--r--src/runtime/NEON/functions/NEFullyConnectedLayer.cpp495
-rw-r--r--src/runtime/NEON/functions/NEFuseBatchNormalization.cpp44
-rw-r--r--src/runtime/NEON/functions/NEGEMM.cpp418
-rw-r--r--src/runtime/NEON/functions/NEGEMMConv2d.cpp79
-rw-r--r--src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp658
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp631
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp178
-rw-r--r--src/runtime/NEON/functions/NEGather.cpp4
-rw-r--r--src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp189
-rw-r--r--src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp31
-rw-r--r--src/runtime/NEON/functions/NEL2NormalizeLayer.cpp9
-rw-r--r--src/runtime/NEON/functions/NELSTMLayer.cpp518
-rw-r--r--src/runtime/NEON/functions/NELSTMLayerQuantized.cpp404
-rw-r--r--src/runtime/NEON/functions/NELogical.cpp16
-rw-r--r--src/runtime/NEON/functions/NEMatMul.cpp85
-rw-r--r--src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp54
-rw-r--r--src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp5
-rw-r--r--src/runtime/NEON/functions/NENormalizationLayer.cpp14
-rw-r--r--src/runtime/NEON/functions/NEPReluLayer.cpp16
-rw-r--r--src/runtime/NEON/functions/NEPadLayer.cpp95
-rw-r--r--src/runtime/NEON/functions/NEPermute.cpp12
-rw-r--r--src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp52
-rw-r--r--src/runtime/NEON/functions/NEPooling3dLayer.cpp76
-rw-r--r--src/runtime/NEON/functions/NEPoolingLayer.cpp23
-rw-r--r--src/runtime/NEON/functions/NEPriorBoxLayer.cpp18
-rw-r--r--src/runtime/NEON/functions/NEQLSTMLayer.cpp1168
-rw-r--r--src/runtime/NEON/functions/NEQuantizationLayer.cpp12
-rw-r--r--src/runtime/NEON/functions/NERNNLayer.cpp55
-rw-r--r--src/runtime/NEON/functions/NEROIAlignLayer.cpp15
-rw-r--r--src/runtime/NEON/functions/NEROIPoolingLayer.cpp20
-rw-r--r--src/runtime/NEON/functions/NERange.cpp10
-rw-r--r--src/runtime/NEON/functions/NEReduceMean.cpp117
-rw-r--r--src/runtime/NEON/functions/NEReductionOperation.cpp79
-rw-r--r--src/runtime/NEON/functions/NERemap.cpp49
-rw-r--r--src/runtime/NEON/functions/NEReorderLayer.cpp (renamed from src/runtime/cpu/operators/CpuQuantize.cpp)48
-rw-r--r--src/runtime/NEON/functions/NEReorgLayer.cpp5
-rw-r--r--src/runtime/NEON/functions/NEReshapeLayer.cpp14
-rw-r--r--src/runtime/NEON/functions/NEReverse.cpp16
-rw-r--r--src/runtime/NEON/functions/NEScale.cpp105
-rw-r--r--src/runtime/NEON/functions/NESelect.cpp6
-rw-r--r--src/runtime/NEON/functions/NESlice.cpp39
-rw-r--r--src/runtime/NEON/functions/NESoftmaxLayer.cpp37
-rw-r--r--src/runtime/NEON/functions/NESpaceToBatchLayer.cpp40
-rw-r--r--src/runtime/NEON/functions/NESpaceToDepthLayer.cpp9
-rw-r--r--src/runtime/NEON/functions/NESplit.cpp2
-rw-r--r--src/runtime/NEON/functions/NEStackLayer.cpp35
-rw-r--r--src/runtime/NEON/functions/NEStridedSlice.cpp64
-rw-r--r--src/runtime/NEON/functions/NETile.cpp5
-rw-r--r--src/runtime/NEON/functions/NETranspose.cpp14
-rw-r--r--src/runtime/NEON/functions/NEUnstack.cpp38
-rw-r--r--src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp778
-rw-r--r--src/runtime/OMP/OMPScheduler.cpp56
-rw-r--r--src/runtime/OffsetLifetimeManager.cpp20
-rw-r--r--src/runtime/OffsetMemoryPool.cpp8
-rw-r--r--src/runtime/OperatorTensor.cpp3
-rw-r--r--src/runtime/PoolManager.cpp11
-rw-r--r--src/runtime/RuntimeContext.cpp3
-rw-r--r--src/runtime/Scheduler.cpp15
-rw-r--r--src/runtime/SchedulerFactory.cpp2
-rw-r--r--src/runtime/SchedulerUtils.cpp19
-rw-r--r--src/runtime/SubTensor.cpp3
-rw-r--r--src/runtime/Tensor.cpp3
-rw-r--r--src/runtime/TensorAllocator.cpp19
-rw-r--r--src/runtime/Utils.cpp15
-rw-r--r--src/runtime/cpu/ICpuOperator.h36
-rw-r--r--src/runtime/cpu/operators/CpuActivation.cpp44
-rw-r--r--src/runtime/cpu/operators/CpuActivation.h58
-rw-r--r--src/runtime/cpu/operators/CpuAdd.cpp46
-rw-r--r--src/runtime/cpu/operators/CpuAdd.h77
-rw-r--r--src/runtime/cpu/operators/CpuCast.h73
-rw-r--r--src/runtime/cpu/operators/CpuConcatenate.cpp173
-rw-r--r--src/runtime/cpu/operators/CpuConcatenate.h83
-rw-r--r--src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp50
-rw-r--r--src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h62
-rw-r--r--src/runtime/cpu/operators/CpuCopy.cpp44
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp523
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2d.h213
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp563
-rw-r--r--src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h86
-rw-r--r--src/runtime/cpu/operators/CpuDequantize.h58
-rw-r--r--src/runtime/cpu/operators/CpuDirectConv2d.cpp147
-rw-r--r--src/runtime/cpu/operators/CpuDirectConv2d.h107
-rw-r--r--src/runtime/cpu/operators/CpuElementwise.cpp124
-rw-r--r--src/runtime/cpu/operators/CpuElementwise.h196
-rw-r--r--src/runtime/cpu/operators/CpuElementwiseUnary.cpp58
-rw-r--r--src/runtime/cpu/operators/CpuElementwiseUnary.h61
-rw-r--r--src/runtime/cpu/operators/CpuFill.cpp39
-rw-r--r--src/runtime/cpu/operators/CpuFill.h48
-rw-r--r--src/runtime/cpu/operators/CpuFlatten.cpp44
-rw-r--r--src/runtime/cpu/operators/CpuFlatten.h66
-rw-r--r--src/runtime/cpu/operators/CpuFloor.cpp44
-rw-r--r--src/runtime/cpu/operators/CpuFloor.h56
-rw-r--r--src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp206
-rw-r--r--src/runtime/cpu/operators/CpuGemmDirectConv2d.h112
-rw-r--r--src/runtime/cpu/operators/CpuMul.cpp77
-rw-r--r--src/runtime/cpu/operators/CpuMul.h109
-rw-r--r--src/runtime/cpu/operators/CpuPRelu.h38
-rw-r--r--src/runtime/cpu/operators/CpuPermute.cpp44
-rw-r--r--src/runtime/cpu/operators/CpuPermute.h62
-rw-r--r--src/runtime/cpu/operators/CpuPool2d.cpp158
-rw-r--r--src/runtime/cpu/operators/CpuPool2d.h87
-rw-r--r--src/runtime/cpu/operators/CpuQuantize.h58
-rw-r--r--src/runtime/cpu/operators/CpuReshape.cpp44
-rw-r--r--src/runtime/cpu/operators/CpuReshape.h57
-rw-r--r--src/runtime/cpu/operators/CpuScale.cpp254
-rw-r--r--src/runtime/cpu/operators/CpuScale.h73
-rw-r--r--src/runtime/cpu/operators/CpuSoftmax.cpp221
-rw-r--r--src/runtime/cpu/operators/CpuSoftmax.h117
-rw-r--r--src/runtime/cpu/operators/CpuSub.cpp46
-rw-r--r--src/runtime/cpu/operators/CpuSub.h86
-rw-r--r--src/runtime/cpu/operators/CpuTranspose.cpp44
-rw-r--r--src/runtime/cpu/operators/CpuTranspose.h56
-rw-r--r--src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp869
-rw-r--r--src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h126
-rw-r--r--src/runtime/cpu/utils/CpuAuxTensorHandler.h101
-rw-r--r--src/runtime/gpu/cl/IClOperator.h37
-rw-r--r--src/runtime/gpu/cl/operators/ClActivation.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClActivation.h60
-rw-r--r--src/runtime/gpu/cl/operators/ClAdd.cpp47
-rw-r--r--src/runtime/gpu/cl/operators/ClAdd.h100
-rw-r--r--src/runtime/gpu/cl/operators/ClCast.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClCast.h74
-rw-r--r--src/runtime/gpu/cl/operators/ClConcatenate.cpp254
-rw-r--r--src/runtime/gpu/cl/operators/ClConcatenate.h86
-rw-r--r--src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h62
-rw-r--r--src/runtime/gpu/cl/operators/ClCopy.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClCopy.h62
-rw-r--r--src/runtime/gpu/cl/operators/ClCrop.cpp46
-rw-r--r--src/runtime/gpu/cl/operators/ClCrop.h74
-rw-r--r--src/runtime/gpu/cl/operators/ClDequantize.cpp53
-rw-r--r--src/runtime/gpu/cl/operators/ClDequantize.h60
-rw-r--r--src/runtime/gpu/cl/operators/ClDirectConv2d.cpp106
-rw-r--r--src/runtime/gpu/cl/operators/ClDirectConv2d.h83
-rw-r--r--src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp92
-rw-r--r--src/runtime/gpu/cl/operators/ClElementwiseOperations.h190
-rw-r--r--src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp116
-rw-r--r--src/runtime/gpu/cl/operators/ClElementwiseUnary.h192
-rw-r--r--src/runtime/gpu/cl/operators/ClFill.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClFill.h61
-rw-r--r--src/runtime/gpu/cl/operators/ClFlatten.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClFlatten.h68
-rw-r--r--src/runtime/gpu/cl/operators/ClFloor.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClFloor.h58
-rw-r--r--src/runtime/gpu/cl/operators/ClGemm.cpp760
-rw-r--r--src/runtime/gpu/cl/operators/ClGemm.h136
-rw-r--r--src/runtime/gpu/cl/operators/ClLogicalNot.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClLogicalNot.h58
-rw-r--r--src/runtime/gpu/cl/operators/ClMul.cpp60
-rw-r--r--src/runtime/gpu/cl/operators/ClMul.h107
-rw-r--r--src/runtime/gpu/cl/operators/ClPRelu.cpp57
-rw-r--r--src/runtime/gpu/cl/operators/ClPRelu.h68
-rw-r--r--src/runtime/gpu/cl/operators/ClPermute.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClPermute.h64
-rw-r--r--src/runtime/gpu/cl/operators/ClPool2d.cpp101
-rw-r--r--src/runtime/gpu/cl/operators/ClPool2d.h72
-rw-r--r--src/runtime/gpu/cl/operators/ClQuantize.cpp53
-rw-r--r--src/runtime/gpu/cl/operators/ClQuantize.h62
-rw-r--r--src/runtime/gpu/cl/operators/ClReshape.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClReshape.h59
-rw-r--r--src/runtime/gpu/cl/operators/ClScale.cpp69
-rw-r--r--src/runtime/gpu/cl/operators/ClScale.h74
-rw-r--r--src/runtime/gpu/cl/operators/ClSoftmax.cpp186
-rw-r--r--src/runtime/gpu/cl/operators/ClSoftmax.h97
-rw-r--r--src/runtime/gpu/cl/operators/ClSub.cpp47
-rw-r--r--src/runtime/gpu/cl/operators/ClSub.h100
-rw-r--r--src/runtime/gpu/cl/operators/ClTranspose.cpp45
-rw-r--r--src/runtime/gpu/cl/operators/ClTranspose.h58
-rw-r--r--src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp299
-rw-r--r--src/runtime/gpu/cl/operators/ClWinogradConv2d.h126
-rw-r--r--src/runtime/gpu/cl/utils/ClAuxTensorHandler.h101
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp205
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h61
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp413
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h61
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h65
-rw-r--r--src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h115
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp309
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h79
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp326
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h74
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp (renamed from src/runtime/cpu/operators/CpuDequantize.cpp)51
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h (renamed from src/runtime/cpu/operators/CpuCast.cpp)33
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h66
-rw-r--r--src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h120
-rw-r--r--src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp167
-rw-r--r--src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h (renamed from src/runtime/cpu/operators/CpuCopy.h)44
-rw-r--r--src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h62
-rw-r--r--src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h108
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp314
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h62
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp113
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h57
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp134
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h83
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h62
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h63
-rw-r--r--src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h117
-rw-r--r--src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h135
376 files changed, 13785 insertions, 21947 deletions
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index ef7c62d64b..eca712dbf0 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -22,9 +22,9 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/Allocator.h"
-#include "arm_compute/runtime/MemoryRegion.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/MemoryRegion.h"
#include <cstddef>
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 1c983aa329..8a0fc05c39 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,12 +30,12 @@
#include <algorithm>
#include <cmath>
+#include <iterator>
#include <map>
namespace arm_compute
{
-BlobLifetimeManager::BlobLifetimeManager()
- : _blobs()
+BlobLifetimeManager::BlobLifetimeManager() : _blobs()
{
}
@@ -61,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings()
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
// Sort free blobs requirements in descending order.
- _free_blobs.sort([](const Blob & ba, const Blob & bb)
- {
- return ba.max_size > bb.max_size;
- });
+ _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; });
// Create group sizes vector
std::vector<BlobInfo> group_sizes;
- std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
- {
- return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() };
- });
+ std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes),
+ [](const Blob &b) {
+ return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()};
+ });
// Update blob sizes
size_t max_size = std::max(_blobs.size(), group_sizes.size());
_blobs.resize(max_size);
group_sizes.resize(max_size);
- std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
- {
- return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) };
- });
+ std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs),
+ [](BlobInfo lhs, BlobInfo rhs)
+ {
+ return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment),
+ std::max(lhs.owners, rhs.owners)};
+ });
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
int blob_idx = 0;
- for(auto &free_blob : _free_blobs)
+ for (auto &free_blob : _free_blobs)
{
- for(auto &bound_element_id : free_blob.bound_elements)
+ for (auto &bound_element_id : free_blob.bound_elements)
{
ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
Element &bound_element = _active_elements[bound_element_id];
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 88e280537c..a2f63ef52b 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -47,7 +47,7 @@ BlobMemoryPool::~BlobMemoryPool()
void BlobMemoryPool::acquire(MemoryMappings &handles)
{
// Set memory to handlers
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_region(_blobs[handle.second].get());
@@ -56,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles)
void BlobMemoryPool::release(MemoryMappings &handles)
{
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_region(nullptr);
@@ -78,7 +78,7 @@ void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- for(const auto &bi : blob_info)
+ for (const auto &bi : blob_info)
{
_blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
}
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index e06ef3d37d..b4545b93bf 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -35,7 +35,8 @@ namespace arm_compute
void *CLBufferAllocator::allocate(size_t size, size_t alignment)
{
ARM_COMPUTE_UNUSED(alignment);
- cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) };
+ cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size,
+ nullptr, nullptr)};
return static_cast<void *>(buf);
}
diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
index 7168259fcd..d680dc08bb 100644
--- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
+++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
@@ -27,8 +27,7 @@
namespace arm_compute
{
-CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle()
- : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
+CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
{
}
CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default;
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index 5b4bbbcde0..eb28ecbf8d 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void
* @return A pointer to the context properties which can be used to create an opencl context
*/
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
+void initialise_context_properties(const cl::Platform &platform,
+ const cl::Device &device,
+ std::array<cl_context_properties, 7> &prop)
{
ARM_COMPUTE_UNUSED(device);
#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
// Query devices in the context for cl_arm_printf support
- if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+ if (arm_compute::device_supports_extension(device, "cl_arm_printf"))
{
// Create a cl_context with a printf_callback and user specified buffer size.
- std::array<cl_context_properties, 7> properties_printf =
- {
+ std::array<cl_context_properties, 7> properties_printf = {
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
// Enable a printf callback function for this context.
CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
// Request a minimum printf buffer size of 4MB for devices in the
// context that support this extension.
- CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
- 0
- };
+ CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0};
prop = properties_printf;
}
else
#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
{
- std::array<cl_context_properties, 3> properties =
- {
- CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
- 0
- };
+ std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM,
+ reinterpret_cast<cl_context_properties>(platform()), 0};
std::copy(properties.begin(), properties.end(), prop.begin());
};
}
@@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
cl::Platform::get(&platforms);
ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
- cl::Platform selected_platform{ nullptr };
+ cl::Platform selected_platform{nullptr};
// If the user has selected the Native platform, return the first available.
- switch(cl_backend_type)
+ switch (cl_backend_type)
{
case CLBackendType::Native:
selected_platform = platforms[0];
break;
case CLBackendType::Clvk:
- for(auto p : platforms)
+ for (auto p : platforms)
{
std::string res = p.getInfo<CL_PLATFORM_NAME>();
- if(res.find("clvk") != std::string::npos)
+ if (res.find("clvk") != std::string::npos)
{
selected_platform = p;
break;
@@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
ARM_COMPUTE_ERROR("Unsupported backend type");
}
- if(!selected_platform())
+ if (!selected_platform())
{
ARM_COMPUTE_ERROR("No valid platform found");
}
@@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
return selected_platform;
}
-std::tuple<cl::Context, cl::Device, cl_int>
-create_opencl_context_and_device(CLBackendType cl_backend_type)
+std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type)
{
ARM_COMPUTE_ERROR_ON(!opencl_is_available());
cl::Platform p = select_preferable_platform(cl_backend_type);
@@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type)
std::vector<cl::Device> platform_devices;
p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
- device = platform_devices[0];
- cl_int err = CL_SUCCESS;
- std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
+ device = platform_devices[0];
+ cl_int err = CL_SUCCESS;
+ std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0};
initialise_context_properties(p, device, properties);
cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
@@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type)
void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
- if(ctx)
+ if (ctx)
{
ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
ctx->gpu_scheduler()->enqueue(*kernel, flush);
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index a1743c56e6..c6ee6fde83 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -24,24 +24,22 @@
#include "arm_compute/runtime/CL/CLMemory.h"
#include "arm_compute/core/Error.h"
+
#include "support/Cast.h"
namespace arm_compute
{
-CLMemory::CLMemory()
- : _region(nullptr), _region_owned(nullptr)
+CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr)
{
}
-CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
- : _region(nullptr), _region_owned(memory)
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
}
-CLMemory::CLMemory(ICLMemoryRegion *memory)
- : _region(memory), _region_owned(nullptr)
+CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
{
_region = memory;
}
@@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
_region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
_region = _region_owned.get();
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 780a563d63..c9ddf9b85c 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,14 +26,12 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
+
namespace arm_compute
{
ICLMemoryRegion::ICLMemoryRegion(size_t size)
- : IMemoryRegion(size),
- _queue(CLScheduler::get().queue()),
- _ctx(CLScheduler::get().context()),
- _mapping(nullptr),
- _mem()
+ : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem()
{
}
@@ -58,21 +56,34 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset,
return nullptr;
}
-CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size)
- : ICLMemoryRegion(size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size)
{
- if(_size != 0)
+ if (_size != 0)
{
_mem = cl::Buffer(CLScheduler::get().context(), flags, _size);
}
}
-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
- : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
{
_mem = buffer;
}
+CLBufferMemoryRegion::~CLBufferMemoryRegion()
+{
+ // Flush the command queue to ensure all commands that may use this memory buffer are scheduled to be finished before
+ // this buffer is freed
+ // Do not call finish as it is a blocking call which affects the performance
+ try
+ {
+ CLScheduler::get().queue().flush();
+ }
+ catch (const std::exception &e)
+ {
+ ARM_COMPUTE_LOG_ERROR_ACL(e.what());
+ }
+}
+
void *CLBufferMemoryRegion::ptr()
{
return nullptr;
@@ -95,10 +106,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
: ICLMemoryRegion(size), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
_ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment);
- if(_ptr != nullptr)
+ if (_ptr != nullptr)
{
_mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
}
@@ -107,15 +118,18 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a
ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
{
- if(_ptr != nullptr)
+ if (_ptr != nullptr)
{
try
{
- clFinish(_queue.get());
+ // Can only use the blocking finish instead of the non-blocking flush here, because clSVMFree requires all
+ // commands that may use the svm pointer to finish beforehand
+ // https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clSVMFree.html
+ clFinish(CLScheduler::get().queue().get());
_mem = cl::Buffer();
clSVMFree(_ctx.get(), _ptr);
}
- catch(...)
+ catch (...)
{
}
}
@@ -134,7 +148,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size
void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
- clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+ clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr,
+ nullptr);
_mapping = _ptr;
return _mapping;
}
@@ -153,7 +168,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si
void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
{
- if(blocking)
+ if (blocking)
{
clFinish(q.get());
}
diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp
index 075a544077..89d4520038 100644
--- a/src/runtime/CL/CLOperator.cpp
+++ b/src/runtime/CL/CLOperator.cpp
@@ -30,14 +30,13 @@ namespace arm_compute
{
namespace experimental
{
-ICLOperator::ICLOperator(IRuntimeContext *ctx)
- : _kernel(), _ctx(ctx), _workspace()
+ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
{
}
void ICLOperator::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 5083b4b0c5..b426b8c304 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -29,7 +30,10 @@
namespace arm_compute
{
CLRuntimeContext::CLRuntimeContext()
- : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type()
+ : _gpu_owned_scheduler(std::make_unique<CLScheduler>()),
+ _gpu_scheduler(_gpu_owned_scheduler.get()),
+ _symbols(),
+ _backend_type()
{
_symbols.load_default();
auto ctx_dev_err = create_opencl_context_and_device(_backend_type);
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index cb5f04ce8b..f0a42f55fd 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event()
void CLScheduler::tune_kernel_static(ICLKernel &kernel)
{
- if(_cl_tuner != nullptr)
+ if (_cl_tuner != nullptr)
{
_cl_tuner->tune_kernel_static(kernel);
}
@@ -95,7 +96,16 @@ bool CLScheduler::is_initialised() const
std::once_flag CLScheduler::_initialize_symbols;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native)
+ : _context(),
+ _queue(),
+ _target(GPUTarget::MIDGARD),
+ _is_initialised(false),
+ _cl_tuner(nullptr),
+ _gemm_heuristics(nullptr),
+ _backend_type(CLBackendType::Native),
+ _job_chaining_enabled(true),
+ _job_chaining_size(1),
+ _job_chaining_count(0)
{
}
@@ -106,9 +116,12 @@ CLScheduler &CLScheduler::get()
return scheduler;
}
-void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h)
+void CLScheduler::default_init_with_context(cl::Device &device,
+ cl::Context &ctx,
+ ICLTuner *cl_tuner,
+ CLGEMMHeuristicsHandle *gemm_h)
{
- if(!_is_initialised)
+ if (!_is_initialised)
{
const std::string cl_kernels_folder("./cl_kernels/");
cl::CommandQueue queue = cl::CommandQueue(ctx, device);
@@ -120,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx
void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
{
- if(!_is_initialised)
+ if (!_is_initialised)
{
cl::Context ctx;
cl::Device dev;
@@ -132,8 +145,16 @@ void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_
init(ctx, queue, dev, cl_tuner, gemm_h);
}
- // Set CL tuner
- _cl_tuner = cl_tuner;
+ // Set CL tuner and GEMM heuristics
+ _cl_tuner = cl_tuner;
+ _gemm_heuristics = gemm_h;
+}
+
+void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+{
+ _is_initialised = false;
+
+ default_init(cl_tuner, gemm_h, cl_backend_type);
}
void CLScheduler::set_context(cl::Context context)
@@ -142,7 +163,12 @@ void CLScheduler::set_context(cl::Context context)
CLKernelLibrary::get().set_context(_context);
}
-void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+void CLScheduler::init(cl::Context context,
+ cl::CommandQueue queue,
+ const cl::Device &device,
+ ICLTuner *cl_tuner,
+ CLGEMMHeuristicsHandle *gemm_h,
+ CLBackendType cl_backend_type)
{
set_context(std::move(context));
_queue = std::move(queue);
@@ -155,22 +181,49 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De
void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
{
- ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
- "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
+ ARM_COMPUTE_ERROR_ON_MSG(
+ !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
const bool inject_memory = !tensors.empty();
// Tune the kernel if the CLTuner has been provided
- if(_cl_tuner != nullptr)
+ if (_cl_tuner != nullptr)
{
inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
}
// Run kernel
inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
+ if (_job_chaining_enabled)
+ {
+ ++_job_chaining_count;
+ }
- if(flush)
+ flush_queue(flush);
+}
+
+void CLScheduler::flush_queue(bool flush)
+{
+ if (_job_chaining_enabled)
+ {
+ if (_job_chaining_count >= _job_chaining_size)
+ {
+ _job_chaining_count = 0;
+ /*
+ Optimisation note: Flush the queue at the first enqueue to start the GPU
+ execution and then incrementally saturate the clFlush calls to minimize
+ the CPU activity for job-scheduling.
+ For eg. job-chain size goes from 1, 2, 4, 8 and 16
+ */
+ if (_job_chaining_size < 16)
+ {
+ _job_chaining_size <<= 1;
+ }
+ _queue.flush();
+ }
+ }
+ else if (flush)
{
_queue.flush();
}
@@ -186,4 +239,10 @@ void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush
{
enqueue_common(kernel, tensors, flush);
}
+
+void CLScheduler::enable_job_chaining(int job_chaining_size)
+{
+ _job_chaining_enabled = true;
+ _job_chaining_size = job_chaining_size;
+}
} // namespace arm_compute
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index d0822414c3..ace820bbb7 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,12 +29,14 @@
using namespace arm_compute;
-CLSubTensor::CLSubTensor()
- : _parent(nullptr), _info()
+CLSubTensor::CLSubTensor() : _parent(nullptr), _info()
{
}
-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
+CLSubTensor::CLSubTensor(ICLTensor *parent,
+ const TensorShape &tensor_shape,
+ const Coordinates &coords,
+ bool extend_parent)
: _parent(nullptr), _info()
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
@@ -81,11 +83,15 @@ void CLSubTensor::unmap()
uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
- return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+ if (_parent->buffer() == nullptr)
+ {
+ _parent->map(q, blocking);
+ }
+ return _parent->buffer();
}
void CLSubTensor::do_unmap(cl::CommandQueue &q)
{
ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
- q.enqueueUnmapMemObject(cl_buffer(), buffer());
+ _parent->unmap(q);
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index f85b8ae777..e6457218c7 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr;
std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment)
{
// Try fine-grain SVM
- std::unique_ptr<ICLMemoryRegion> region = std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
- size,
- alignment);
+ std::unique_ptr<ICLMemoryRegion> region =
+ std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
// Try coarse-grain SVM in case of failure
- if(region != nullptr && region->ptr() == nullptr)
+ if (region != nullptr && region->ptr() == nullptr)
{
region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment);
}
// Try legacy buffer memory in case of failure
- if(region != nullptr && region->ptr() == nullptr)
+ if (region != nullptr && region->ptr() == nullptr)
{
region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
}
@@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset)
* @param[in] qinfo Quantization info
* @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes
*/
-void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size)
+void populate_quantization_info(CLFloatArray &scale,
+ CLInt32Array &offset,
+ const QuantizationInfo &qinfo,
+ size_t pad_size)
{
clear_quantization_arrays(scale, offset);
@@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const
const size_t element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type);
scale = CLFloatArray(num_elements + pad_size);
scale.resize(num_elements);
- CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data());
+ CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size,
+ qinfo.scale().data());
- if(!qinfo.offset().empty())
+ if (!qinfo.offset().empty())
{
// Create offset array
- const std::vector<int32_t> &qoffset = qinfo.offset();
- const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
- offset = CLInt32Array(num_elements + pad_size);
+ const std::vector<int32_t> &qoffset = qinfo.offset();
+ const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
+ offset = CLInt32Array(num_elements + pad_size);
offset.resize(num_elements);
- CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data());
+ CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0,
+ num_elements * offset_element_size, qinfo.offset().data());
}
}
} // namespace
@@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext
CLQuantization CLTensorAllocator::quantization() const
{
- return { &_scale, &_offset };
+ return {&_scale, &_offset};
}
uint8_t *CLTensorAllocator::data()
@@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const
void CLTensorAllocator::allocate()
{
// Allocate tensor backing memory
- if(_associated_memory_group == nullptr)
+ if (_associated_memory_group == nullptr)
{
// Perform memory allocation
- if(static_global_cl_allocator != nullptr)
+ if (static_global_cl_allocator != nullptr)
{
_memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0));
}
@@ -146,7 +150,7 @@ void CLTensorAllocator::allocate()
}
// Allocate and fill the quantization parameter arrays
- if(is_data_type_quantized_per_channel(info().data_type()))
+ if (is_data_type_quantized_per_channel(info().data_type()))
{
const size_t pad_size = 0;
populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size);
@@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator)
uint8_t *CLTensorAllocator::lock()
{
- if(_ctx)
+ if (_ctx)
{
return map(_ctx->gpu_scheduler()->queue(), true);
}
@@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock()
void CLTensorAllocator::unlock()
{
ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- if(_ctx)
+ if (_ctx)
{
unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
}
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index e16d6808ed..0d62fe3afe 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,10 +22,12 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "support/StringSupport.h"
@@ -36,10 +38,36 @@
namespace arm_compute
{
CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
- : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info)
+ : real_clEnqueueNDRangeKernel(nullptr),
+ _tuning_params_table(),
+ _lws_table(),
+ _kernel_event(),
+ _tune_new_kernels(tune_new_kernels),
+ _tuning_info(tuning_info)
{
}
+struct CLTuner::IKernelData
+{
+ virtual ~IKernelData() = default;
+ virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0;
+};
+struct DefaultKernelData : public CLTuner::IKernelData
+{
+ DefaultKernelData(ITensorPack &tensors) : _tensors{tensors}
+ {
+ }
+ ~DefaultKernelData() override = default;
+ void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override
+ {
+ const bool inject_memory = !_tensors.empty();
+ inject_memory ? kernel.run_op(_tensors, kernel.window(), queue) : kernel.run(kernel.window(), queue);
+ }
+
+private:
+ ITensorPack &_tensors;
+};
+
bool CLTuner::kernel_event_is_set() const
{
return _kernel_event() != nullptr;
@@ -63,11 +91,6 @@ void CLTuner::set_tuner_mode(CLTunerMode mode)
_tuning_info.tuner_mode = mode;
}
-CLTunerMode CLTuner::get_tuner_mode() const
-{
- return _tuning_info.tuner_mode;
-}
-
void CLTuner::tune_kernel_static(ICLKernel &kernel)
{
ARM_COMPUTE_UNUSED(kernel);
@@ -79,29 +102,30 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
tune_kernel_dynamic(kernel, pack);
}
-void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
{
// Get the configuration ID from the kernel and append GPU target name and number of available compute units
- const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
+ const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" +
+ support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
// Check if we need to find the Optimal LWS. If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned
- if(kernel.config_id() != arm_compute::default_config_id)
+ if (kernel.config_id() != arm_compute::default_config_id)
{
auto p = _tuning_params_table.find(config_id);
- if(p == _tuning_params_table.end())
+ if (p == _tuning_params_table.end())
{
- if(_tune_new_kernels)
+ if (_tune_new_kernels)
{
// Find the optimal LWS for the kernel
- CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, tensors);
+ CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data);
// Insert the optimal LWS in the table
add_tuning_params(config_id, opt_tuning_params);
// Set Local-Workgroup-Size
kernel.set_lws_hint(opt_tuning_params.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
kernel.set_wbsm_hint(opt_tuning_params.get_wbsm());
}
@@ -111,17 +135,18 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
{
// Set Local-Workgroup-Size
kernel.set_lws_hint(p->second.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
kernel.set_wbsm_hint(p->second.get_wbsm());
}
}
}
}
-
-void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws)
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
{
- add_tuning_params(kernel_id, CLTuningParams(optimal_lws));
+ DefaultKernelData data{tensors};
+
+ do_tune_kernel_dynamic(kernel, &data);
}
void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params)
@@ -129,13 +154,13 @@ void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams opt
_tuning_params_table.emplace(kernel_id, optimal_tuning_params);
}
-CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors)
+CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data)
{
// Profiling queue
cl::CommandQueue queue_profiler;
// Extract real OpenCL function to intercept
- if(real_clEnqueueNDRangeKernel == nullptr)
+ if (real_clEnqueueNDRangeKernel == nullptr)
{
real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
}
@@ -146,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
// Check if we can use the OpenCL timer with the default queue
cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
- if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ if ((props & CL_QUEUE_PROFILING_ENABLE) == 0)
{
// Set the queue for profiling
queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
@@ -157,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
}
// Start intercepting enqueues:
- auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list, cl_event * event)
+ auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo,
+ const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event)
{
- if(this->kernel_event_is_set())
+ if (this->kernel_event_is_set())
{
// If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
return CL_SUCCESS;
}
cl_event tmp;
- cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+ cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws,
+ num_events_in_wait_list, event_wait_list, &tmp);
// Set OpenCL event
this->set_cl_kernel_event(tmp);
- if(event != nullptr)
+ if (event != nullptr)
{
//return cl_event from the intercepted call
clRetainEvent(tmp);
@@ -181,11 +208,19 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
};
CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
- cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
-
// Run the kernel with default lws to be used as baseline
- const bool inject_memory = !tensors.empty();
- inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
+ data->do_run(kernel, queue_profiler);
+
+ /// Get the cached gws used by the kernel
+ /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws
+ /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step.
+ /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op()
+ /// Please see COMPMID-5934
+ cl::NDRange gws = kernel.get_cached_gws();
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(
+ arm_compute::logging::LogLevel::INFO,
+ "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(),
+ to_string(gws).c_str());
queue_profiler.finish();
@@ -198,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
// Construct the list of tuning parameters values to be tested based on the tuner mode.
auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws);
- for(size_t i = 0; i < tuning_list->size(); ++i)
+ for (size_t i = 0; i < tuning_list->size(); ++i)
{
CLTuningParams tuning_test = (*tuning_list)[i];
// Setting the lws
@@ -208,20 +243,22 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
auto z = lws_test[2];
const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
- if(invalid_lws)
+ if (invalid_lws)
{
continue;
}
kernel.set_lws_hint(lws_test);
- if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
+ if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
{
cl_int wbsm_test = tuning_test.get_wbsm();
kernel.set_wbsm_hint(wbsm_test);
}
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d",
+ to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint());
// Run the kernel
- inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
+ data->do_run(kernel, queue_profiler);
queue_profiler.finish();
@@ -231,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
_kernel_event = nullptr;
// Check the execution time
- if(diff < min_exec_time)
+ if (diff < min_exec_time)
{
min_exec_time = diff;
opt_tuning_params.set_lws(tuning_test.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
opt_tuning_params.set_wbsm(tuning_test.get_wbsm());
}
@@ -247,25 +284,6 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
return opt_tuning_params;
}
-void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
-{
- _tuning_params_table.clear();
- for(auto && params : lws_table)
- {
- add_tuning_params(params.first, CLTuningParams(params.second));
- }
-}
-
-const std::unordered_map<std::string, cl::NDRange> &CLTuner::lws_table()
-{
- _lws_table.clear();
- for(auto && params : _tuning_params_table)
- {
- _lws_table.emplace(params.first, params.second.get_lws());
- }
- return _lws_table;
-}
-
const std::unordered_map<std::string, CLTuningParams> &CLTuner::tuning_params_table() const
{
return _tuning_params_table;
@@ -282,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename)
std::ifstream fs;
fs.exceptions(std::ifstream::badbit);
fs.open(filename, std::ios::in);
- if(!fs.is_open())
+ if (!fs.is_open())
{
ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
}
std::string line;
bool header_line = true;
- while(!std::getline(fs, line).fail())
+ while (!std::getline(fs, line).fail())
{
- if(header_line)
+ if (header_line)
{
header_line = false;
size_t pos_lws = line.find("lws");
size_t pos_wbsm = line.find("wbsm");
_tuning_info.tune_wbsm = false;
- if(pos_lws != std::string::npos || pos_wbsm != std::string::npos)
+ if (pos_lws != std::string::npos || pos_wbsm != std::string::npos)
{
// The file has in the first line the parameters it has been tuned on
- if(pos_wbsm != std::string::npos)
+ if (pos_wbsm != std::string::npos)
{
_tuning_info.tune_wbsm = true;
}
// Once the line with the tuning parameter is read we can
// read the next one to start collecting the values
- if(std::getline(fs, line).fail())
+ if (std::getline(fs, line).fail())
{
break;
}
@@ -314,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename)
CLTuningParams tuning_params;
size_t pos = line.find(";");
- if(pos == std::string::npos)
+ if (pos == std::string::npos)
{
ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
std::string kernel_id = line.substr(0, pos);
line.erase(0, pos + 1);
- if(!tuning_params.from_string(_tuning_info, line))
+ if (!tuning_params.from_string(_tuning_info, line))
{
ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
@@ -331,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename)
bool CLTuner::save_to_file(const std::string &filename) const
{
- if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
+ if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
{
return false;
}
@@ -340,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const
fs.open(filename, std::ios::out);
std::string header_string = "";
header_string += "lws";
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
- if(!header_string.empty())
+ if (!header_string.empty())
{
header_string += " ";
}
header_string += "wbsm";
}
fs << header_string << std::endl;
- for(auto const &kernel_data : _tuning_params_table)
+ for (auto const &kernel_data : _tuning_params_table)
{
CLTuningParams tun_pams(kernel_data.second);
fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl;
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index 4530537789..bc782c3a2c 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -26,15 +26,14 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
using namespace arm_compute;
ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT
- : _kernel(),
- _border_handler(std::make_unique<CLFillBorderKernel>()),
- _ctx(ctx)
+ : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx)
{
}
diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp
index da3d4850bf..294396c28a 100644
--- a/src/runtime/CL/Utils.cpp
+++ b/src/runtime/CL/Utils.cpp
@@ -35,20 +35,20 @@ namespace arm_compute
void restore_program_cache_from_file(const std::string &filename)
{
std::ifstream cache_file(filename, std::ios::binary);
- if(cache_file.is_open())
+ if (cache_file.is_open())
{
- if(!CLScheduler::get().is_initialised())
+ if (!CLScheduler::get().is_initialised())
{
arm_compute::CLScheduler::get().default_init();
}
- while(!cache_file.eof())
+ while (!cache_file.eof())
{
size_t name_len = 0;
size_t binary_len = 0;
cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t));
cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t));
- if(name_len == 0 || binary_len == 0)
+ if (name_len == 0 || binary_len == 0)
{
break;
}
@@ -60,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename)
tmp.resize(binary_len);
cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len);
cl::Context context = arm_compute::CLScheduler::get().context();
- cl::Program::Binaries binaries{ binary };
+ cl::Program::Binaries binaries{binary};
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
cl::Program program(context, devices, binaries);
program.build();
@@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename)
void save_program_cache_to_file(const std::string &filename)
{
- if(CLScheduler::get().is_initialised())
+ if (CLScheduler::get().is_initialised())
{
std::ofstream cache_file(filename, std::ios::binary);
- if(cache_file.is_open())
+ if (cache_file.is_open())
{
- for(const auto &it : CLKernelLibrary::get().get_built_programs())
+ for (const auto &it : CLKernelLibrary::get().get_built_programs())
{
std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>();
ARM_COMPUTE_ERROR_ON(binaries.size() != 1);
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9c71b2aa7d..c035644e4a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,26 +26,27 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClActivation.h"
namespace arm_compute
{
struct CLActivationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- CLRuntimeContext *ctx{ nullptr };
- std::unique_ptr<opencl::ClActivation> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ CLRuntimeContext *ctx{nullptr};
+ std::unique_ptr<opencl::ClActivation> op{nullptr};
};
-CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
+CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default;
CLActivationLayer::~CLActivationLayer() = default;
@@ -54,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio
configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
}
-void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -65,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info);
}
-Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
return opencl::ClActivation::validate(input, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index 8c32563abb..f9bbd31e8a 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,8 +27,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -37,76 +39,52 @@
namespace arm_compute
{
CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), _reduction_kernels_vector(), _reshape(), _num_of_stages(), _reduction_axis()
+ : _memory_group(std::move(memory_manager)),
+ _not_reshaped_output(),
+ _arg_min_max_kernel(),
+ _reshape(),
+ _reduction_axis()
{
}
CLArgMinMaxLayer::~CLArgMinMaxLayer() = default;
-Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid reduction operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- const unsigned int num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
DataType output_data_type = DataType::S32;
TensorInfo not_reshaped_output;
const auto input_num_channles = input->num_channels();
const auto input_qinfo = input->quantization_info();
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
output_data_type = output->data_type();
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
}
auto shape_before_reshape = input->tensor_shape();
shape_before_reshape.set(axis, 1);
- auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
- {
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+ QuantizationInfo qinfo) {
ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
};
initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
- if(num_of_stages == 1)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, &not_reshaped_output, axis, op));
- }
- else
- {
- // Create temporary tensor infos
- std::vector<TensorInfo> sums_vector(num_of_stages - 1);
-
- // Create intermediate tensor info
- TensorShape shape{ input->tensor_shape() };
-
- for(unsigned int i = 0; i < num_of_stages - 1; i++)
- {
- shape.set(0, ceil(shape.x() / 128.f));
- sums_vector[i].set_data_type(input->data_type());
- sums_vector[i].set_tensor_shape(shape);
- sums_vector[i].set_num_channels(input->num_channels());
- }
-
- // Validate ReductionOperation only on first kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, &sums_vector[0], axis, op));
-
- // Validate ReductionOperation on intermediate stages
- for(unsigned int i = 1; i < num_of_stages - 1; ++i)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op));
- }
-
- // Validate ReductionOperation on the last stage
- const unsigned int last_stage = num_of_stages - 1;
- ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &not_reshaped_output, axis, op));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output));
return Status{};
}
@@ -116,58 +94,42 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op);
}
-void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int axis,
+ ICLTensor *output,
+ const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
- _reduction_axis = axis;
-
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
- // Configure reduction operation kernels
- _reduction_kernels_vector.reserve(_num_of_stages);
+ _reduction_axis = axis;
- auto add_reduction_kernel = [this, &compile_context, axis, op](const ICLTensor * input, const ICLTensor * prev_output, ICLTensor * output)
- {
- _reduction_kernels_vector.emplace_back(std::make_unique<CLArgMinMaxLayerKernel>());
- _reduction_kernels_vector.back()->configure(compile_context, input, prev_output, output, axis, op);
- };
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ DataType output_data_type =
+ (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ TensorShape not_reshaped_output_shape{input->info()->tensor_shape()};
+ not_reshaped_output_shape.set(axis, 1);
+ auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+ ->clone()
+ ->set_tensor_shape(not_reshaped_output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>();
+ _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op);
_memory_group.manage(&_not_reshaped_output);
- // Create temporary tensors
- if(_num_of_stages == 1)
- {
- add_reduction_kernel(input, nullptr, &_not_reshaped_output);
- }
- else
- {
- _results_vector.resize(_num_of_stages - 1);
- TensorShape shape{ input->info()->tensor_shape() };
- for(unsigned int i = 0; i < _num_of_stages - 1; i++)
- {
- shape.set(0, ceil(shape.x() / 128.f));
- _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type));
- }
-
- // Apply ReductionOperation only on first kernel
- _memory_group.manage(&_results_vector[0]);
- add_reduction_kernel(input, nullptr, &_results_vector[0]);
-
- // Apply ReductionOperation on intermediate stages
- for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
- {
- _memory_group.manage(&_results_vector[i]);
- add_reduction_kernel(input, &_results_vector[i - 1], &_results_vector[i]);
- _results_vector[i - 1].allocator()->allocate();
- }
-
- // Apply ReductionOperation on the last stage
- const unsigned int last_stage = _num_of_stages - 1;
- add_reduction_kernel(input, &_results_vector[last_stage - 1], &_not_reshaped_output);
- _results_vector[last_stage - 1].allocator()->allocate();
- }
+
_reshape.configure(compile_context, &_not_reshaped_output, output);
_not_reshaped_output.allocator()->allocate();
}
@@ -176,10 +138,7 @@ void CLArgMinMaxLayer::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- for(unsigned int i = 0; i < _num_of_stages; ++i)
- {
- CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false);
- }
+ CLScheduler::get().enqueue(*_arg_min_max_kernel, false);
_reshape.run();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 6b76da81c6..0c371c4171 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
namespace arm_compute
@@ -41,23 +42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer()
CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default;
-void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
+void CLBatchNormalizationLayer::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma, float epsilon,
- ActivationLayerInfo act_info)
+void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
_norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
}
-Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info);
}
@@ -66,4 +84,4 @@ void CLBatchNormalizationLayer::run()
{
CLScheduler::get().enqueue(*_norm_kernel, true);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index c2fdb74777..a3798daf61 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,12 +30,12 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
namespace arm_compute
{
-CLBatchToSpaceLayer::CLBatchToSpaceLayer()
- : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
+CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
{
}
@@ -43,32 +43,49 @@ CLBatchToSpaceLayer::~CLBatchToSpaceLayer() = default;
void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
+ _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
_batch_to_space_kernel->configure(compile_context, input, block_shape, output);
}
-void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayer::configure(
+ const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
- _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output);
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output);
+ _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info);
}
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
return CLBatchToSpaceLayerKernel::validate(input, block_shape, output);
}
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
- return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+ return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
}
void CLBatchToSpaceLayer::run()
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index 70e27c0cca..7bfd0e3677 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -34,10 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseAnd::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input1, input2, output, BitwiseOperation::AND);
_kernel = std::move(k);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 7970a1698b..9763915c02 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -36,8 +37,9 @@ void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT);
_kernel = std::move(k);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index fbda9ad289..dd3171b982 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -34,10 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseOr::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input1, input2, output, BitwiseOperation::OR);
_kernel = std::move(k);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index 4f4b74c04c..5bee4b37ec 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -34,10 +35,14 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseXor::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input1, input2, output, BitwiseOperation::XOR);
_kernel = std::move(k);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 0dade0a369..76e626fd75 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,24 +23,37 @@
*/
#include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
namespace arm_compute
{
-void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
+
// Configure Bounding Box kernel
auto k = std::make_unique<CLBoundingBoxTransformKernel>();
k->configure(compile_context, boxes, pred_boxes, deltas, info);
_kernel = std::move(k);
}
-Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 53256ebed4..42ec8f7ee0 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -26,8 +26,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCast.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
@@ -35,16 +37,15 @@ namespace arm_compute
{
struct CLCast::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCast> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCast> op{nullptr};
};
-CLCast::CLCast()
- : _impl(std::make_unique<Impl>())
+CLCast::CLCast() : _impl(std::make_unique<Impl>())
{
}
-CLCast::CLCast(CLCast &&) = default;
+CLCast::CLCast(CLCast &&) = default;
CLCast &CLCast::operator=(CLCast &&) = default;
CLCast::~CLCast() = default;
@@ -53,9 +54,13 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy
configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
}
-void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+void CLCast::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, policy);
_impl->src = input;
_impl->dst = output;
@@ -71,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con
void CLCast::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index c6af5a05d5..1ee4789816 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
namespace arm_compute
@@ -33,8 +35,12 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output,
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, num_groups);
auto k = std::make_unique<CLChannelShuffleLayerKernel>();
k->configure(compile_context, input, output, num_groups);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index 4122928578..2f54371e88 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,8 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLComparisonKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
@@ -35,24 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparison::configure(const CLCompileContext &compile_context, // configures the comparison kernel, then (conditionally) the border handler
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output, operation); // both operands are passed, in argument order
auto k = std::make_unique<CLComparisonKernel>();
k->configure(compile_context, input1, input2, output, operation);
_kernel = std::move(k);
- if(output->info()->dimension(0) > 1)
+ if (output->info()->dimension(0) > 1) // border handling is only considered when the output extent along dimension 0 exceeds 1
{
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
- if(broadcasted_info->info()->dimension(0) == 1)
+ if (broadcasted_info->info()->dimension(0) == 1) // true only when the selected input has extent 1 along dimension 0
{
- _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), // border size is taken from the kernel configured above
+ BorderMode::REPLICATE);
}
}
}
-Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparison::validate(const ITensorInfo *input1, // delegates entirely to CLComparisonKernel::validate
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation) // all four arguments are forwarded unchanged
{
return CLComparisonKernel::validate(input1, input2, output, operation);
}
@@ -64,25 +75,30 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC
}
template <ComparisonOperation COP>
-void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, // the comparison operation is the template parameter COP
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output) // NOTE(review): no ARM_COMPUTE_LOG_PARAMS call here, unlike CLComparison::configure - confirm intentional
{
auto k = std::make_unique<CLComparisonKernel>();
k->configure(compile_context, input1, input2, output, COP);
_kernel = std::move(k);
- if(output->info()->dimension(0) > 1)
+ if (output->info()->dimension(0) > 1) // border handling is only considered when the output extent along dimension 0 exceeds 1
{
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
- if(broadcasted_info->info()->dimension(0) == 1)
+ if (broadcasted_info->info()->dimension(0) == 1) // true only when the selected input has extent 1 along dimension 0
{
- _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), // border size is taken from the kernel configured above
+ BorderMode::REPLICATE);
}
}
}
template <ComparisonOperation COP>
-Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status // result of CLComparisonKernel::validate with COP supplied as the operation
+CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
return CLComparisonKernel::validate(input1, input2, output, COP);
}
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index ea96e45bf8..9df1c34593 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -24,22 +24,23 @@
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClConcatenate.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
namespace arm_compute
{
struct CLConcatenateLayer::Impl
{
std::vector<const ICLTensor *> srcs{};
- ICLTensor *dst{ nullptr };
- unsigned int num_inputs{ 0 };
- unsigned int axis{ 0 };
- std::unique_ptr<opencl::ClConcatenate> op{ nullptr };
+ ICLTensor *dst{nullptr}; // assigned from `output` in configure()
+ unsigned int num_inputs{0}; // loop bound used by run() when packing the source tensors
+ unsigned int axis{0}; // NOTE(review): assignment is not visible in this diff - presumably the concatenation axis from configure()
+ std::unique_ptr<opencl::ClConcatenate> op{nullptr}; // created in configure()
};
-CLConcatenateLayer::CLConcatenateLayer()
- : _impl(std::make_unique<Impl>())
+CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>()) // allocates an Impl holding its default member values
{
}
@@ -54,9 +55,13 @@ void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector
configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
}
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context,
+ std::vector<const ICLTensor *> &inputs_vector,
+ ICLTensor *output,
+ size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis);
_impl->srcs = inputs_vector;
_impl->dst = output;
@@ -65,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
_impl->op = std::make_unique<opencl::ClConcatenate>();
std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+ for (unsigned int i = 0; i < inputs_vector.size(); ++i)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -73,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
_impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis);
}
-Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, // delegates entirely to opencl::ClConcatenate::validate
+ const ITensorInfo *output,
+ size_t axis) // all three arguments are forwarded unchanged
{
return opencl::ClConcatenate::validate(inputs_vector, output, axis);
}
@@ -81,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu
void CLConcatenateLayer::run()
{
ITensorPack pack;
- for(unsigned i = 0; i < _impl->num_inputs; ++i)
+ for (unsigned i = 0; i < _impl->num_inputs; ++i)
{
pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
}
diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp
new file mode 100644
index 0000000000..9d1b368f72
--- /dev/null
+++ b/src/runtime/CL/functions/CLConv3D.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConv3D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/gpu/cl/operators/ClDirectConv3d.h"
+
+namespace arm_compute
+{
+using namespace arm_compute::experimental;
+
+struct CLConv3D::Impl // state written by configure() and read by run()
+{
+ const ICLTensor *src{nullptr}; // set in configure(); packed as ACL_SRC_0 in run()
+ const ICLTensor *weights{nullptr}; // set in configure(); packed as ACL_SRC_1 in run()
+ const ICLTensor *biases{nullptr}; // may remain null: configure() null-checks only src, weights and dst
+ ICLTensor *dst{nullptr}; // set in configure(); packed as ACL_DST in run()
+ std::unique_ptr<opencl::ClDirectConv3d> op{nullptr}; // created in configure()
+};
+
+CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>()) // allocates an Impl with null tensors and no operator
+{
+}
+
+CLConv3D::~CLConv3D() = default;
+
+void CLConv3D::configure(const ICLTensor *src, // overload that takes no compile context
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); // forwards to the other overload with the compile context from CLKernelLibrary::get()
+}
+
+void CLConv3D::configure(const CLCompileContext &compile_context, // validates, stores the tensors, then creates and configures the operator
+ const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); // biases is deliberately absent from this check; it is treated as optional below
+ ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(
+ src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); // a null biases tensor is passed on as a null info pointer
+
+ _impl->src = src; // tensors are kept so run() can pack them
+ _impl->weights = weights;
+ _impl->biases = biases;
+ _impl->dst = dst;
+
+ _impl->op = std::make_unique<opencl::ClDirectConv3d>(); // replaces any operator held from an earlier call
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), // the operator is configured from tensor infos, not the tensors
+ _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
+}
+
+Status CLConv3D::validate(const ITensorInfo *src, // delegates entirely to opencl::ClDirectConv3d::validate
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); // all five arguments are forwarded unchanged
+}
+
+void CLConv3D::run() // packs the tensors stored by configure() and runs the operator on them
+{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); // NOTE(review): biases may be nullptr here - confirm add_tensor accepts a null tensor
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+ _impl->op->run(pack); // NOTE(review): op is null until configure() has been called; no guard is visible here
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index 8189eee402..2298f2a669 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -27,41 +27,50 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
namespace arm_compute
{
struct CLConvertFullyConnectedWeights::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{ nullptr };
+ const ICLTensor *src{nullptr}; // assigned from `input` in configure()
+ ICLTensor *dst{nullptr}; // assigned from `output` in configure()
+ std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr}; // created in configure()
};
-CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights()
- : _impl(std::make_unique<Impl>())
+CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>()) // allocates an Impl with null tensors and no operator
{
}
CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default;
-void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, // overload that takes no compile context; the body forwards with CLKernelLibrary's
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
}
-void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, // stores the tensors, then creates and configures the operator
+ const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); // placed after the null check on input and output
_impl->src = input;
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClConvertFullyConnectedWeights>();
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
}
-Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, // delegates entirely to opencl::ClConvertFullyConnectedWeights::validate
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout) // all four arguments are forwarded unchanged
{
return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
}
@@ -74,4 +83,4 @@ void CLConvertFullyConnectedWeights::run()
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 96d7cc72c2..7767b45a01 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,115 +23,149 @@
*/
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
-#include <cmath>
-#include <memory>
-#include <tuple>
+#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/operators/ClConv2d.h"
+#include "support/Cast.h"
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
-
-CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_manager(std::move(memory_manager)), _function()
+using namespace arm_compute::experimental;
+struct CLConvolutionLayer::Impl // holds either `op` (operator path) or `func` (FFT path), plus the state each needs
+{
+ MemoryGroup memory_group{}; // rebuilt in configure() on the operator path; scoped for the duration of run()
+ std::shared_ptr<IMemoryManager> memory_manager{}; // taken from the constructor argument
+ std::unique_ptr<opencl::IClOperator> op{nullptr}; // set for WINOGRAD, DIRECT, INDIRECT and GEMM
+ ITensorPack run_pack{}; // input, weights, biases and output; passed to op->run()
+ ITensorPack prep_pack{}; // weights and biases; passed to op->prepare()
+ WorkspaceData<CLTensor> workspace{}; // result of manage_workspace() in configure()
+ experimental::MemoryRequirements aux_mem_req{}; // copy of op->workspace() taken in configure()
+ std::unique_ptr<IFunction> func{nullptr}; // set only for ConvolutionMethod::FFT
+};
+
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) // allocates the Impl first, then fills in the manager
{
+ _impl->memory_manager = std::move(memory_manager); // the by-value argument is moved into the Impl
}
CLConvolutionLayer::~CLConvolutionLayer() = default;
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(ICLTensor *input, // overload that takes no compile context
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, // forwards with the compile context from CLKernelLibrary::get()
+ dilation, act_info, enable_fast_math, num_groups);
}
-void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(const CLCompileContext &compile_context, // validates, picks a convolution method, then sets up either `op` or `func`
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
+ ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, // a null biases tensor is passed on as a null info pointer
+ weights_info, dilation, act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
+
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); // bundles the per-call options for the ClConv2d calls below
- switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
- weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
+ switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, // method selection now comes from opencl::ClConv2d
+ weights_info, CLScheduler::get().target()))
{
case ConvolutionMethod::WINOGRAD:
- {
- ARM_COMPUTE_ERROR_ON(num_groups != 1);
- auto f = std::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
- f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
- _function = std::move(f);
- break;
- }
case ConvolutionMethod::DIRECT:
- {
- ARM_COMPUTE_ERROR_ON(num_groups != 1);
- auto f = std::make_unique<CLDirectConvolutionLayer>();
- f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
- _function = std::move(f);
- break;
- }
+ case ConvolutionMethod::INDIRECT: // WINOGRAD, DIRECT, INDIRECT and GEMM all share the ClConv2d block below
case ConvolutionMethod::GEMM:
{
- auto f = std::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
- f->configure(compile_context, input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
- _function = std::move(f);
+ auto f = std::make_unique<opencl::ClConv2d>();
+ f->configure(compile_context, input->info(), weights->info(), // the operator is configured from tensor infos, not the tensors
+ ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+ _impl->op = std::move(f);
break;
}
case ConvolutionMethod::FFT:
{
- auto f = std::make_unique<CLFFTConvolutionLayer>(_memory_manager);
+ auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager); // FFT stays a full function and receives the stored memory manager
f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
- _function = std::move(f);
+ _impl->func = std::move(f);
break;
}
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
}
+
+ if (_impl->op) // operator path only; the FFT path sets func and skips this block
+ {
+ _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); // NOTE(review): this moves from the stored manager, so a later configure() taking the FFT branch would pass a moved-from pointer - confirm reconfiguration is not expected
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; // consumed by run()
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; // consumed by prepare()
+ _impl->workspace =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); // presumably allocates the auxiliary tensors and adds them to the packs - confirm in MemoryHelpers.h
+ }
}
-Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CLConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW),
+ "Grouping (num_groups != 1) with NHWC data layout is not supported");
- const GPUTarget gpu_target = CLScheduler::get().target();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
+ switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
{
case ConvolutionMethod::WINOGRAD:
- {
- //Validate Winograd
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLWinogradConvolutionLayer is not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
- break;
- }
case ConvolutionMethod::DIRECT:
- {
- // Validate direct convolution layer
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLDirectConvolutionLayer is not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
- break;
- }
+ case ConvolutionMethod::INDIRECT:
case ConvolutionMethod::GEMM:
{
- // Validate gemm-based convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
break;
}
case ConvolutionMethod::FFT:
{
// Validate FFT-based convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info,
+ act_info, enable_fast_math));
break;
}
default:
@@ -142,120 +176,48 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
return Status{};
}
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info,
+ const GPUTarget gpu_target,
+ const Size2D &dilation,
+ bool enable_fast_math)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_UNUSED(weights_info);
- ARM_COMPUTE_UNUSED(gpu_target);
-
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- /* Input spatial dims, kernel size, IFM/OFM, conv info*/
- using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
- using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
-
- const std::vector<ConfigurationMethod> known_configs =
- {
- // Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
- // VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
- };
-
- const auto find_config = [&](ConfigurationMethod c)
- {
- const ConvolutionConfiguration config = c.first;
- const PadStrideInfo info = std::get<3>(config);
- const DataLayout data_layout = std::get<4>(config);
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1);
+ return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target);
+}
- return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == input->data_layout());
- };
+void CLConvolutionLayer::run()
+{
+ prepare();
- std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
- {
- return (*found).second;
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
- if(dilation != Size2D(1U, 1U))
+ if (_impl->func)
{
- return ConvolutionMethod::GEMM;
+ _impl->func->run();
}
else
{
- if(input->data_layout() == DataLayout::NCHW)
- {
- // SRGAN
- if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
- && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::DIRECT;
- }
- if((weights->dimension(idx_h) > 5) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
- {
- return ConvolutionMethod::FFT;
- }
- if(input->dimension(idx_c) < 16)
- {
- return ConvolutionMethod::GEMM;
- }
- return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
- }
- else
- {
- // SRGAN
- if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
- && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::DIRECT;
- }
- if(gpu_target == GPUTarget::G71)
- {
- if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) >= output->dimension(idx_c))
- && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
- {
- return ConvolutionMethod::FFT;
- }
- }
- else if(is_data_type_float(input->data_type()))
- {
- if((weights->dimension(idx_h) >= 5) && (input->dimension(idx_c) >= output->dimension(idx_c)) && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::DIRECT;
- }
- }
- if(input->dimension(idx_c) < 16)
- {
- return ConvolutionMethod::GEMM;
- }
- return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
- }
+ _impl->op->run(_impl->run_pack);
}
}
-void CLConvolutionLayer::run()
-{
- prepare();
- _function->run();
-}
-
void CLConvolutionLayer::prepare()
{
- _function->prepare();
+ if (_impl->func) // func is set only by the FFT branch of configure()
+ {
+ _impl->func->prepare();
+ }
+ else // operator path
+ {
+ _impl->op->prepare(_impl->prep_pack); // NOTE(review): run() calls prepare() on every invocation and no prepared flag is visible here - confirm repeat calls are cheap in the operator
+
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries(_impl->aux_mem_req, _impl->workspace);
+ }
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 98916bf38a..a4f2b0634f 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -27,8 +27,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCopy.h"
+#include "src/gpu/cl/operators/ClCopy.h"
#include <utility>
@@ -36,16 +38,15 @@ namespace arm_compute
{
struct CLCopy::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCopy> op{ nullptr };
+ const ICLTensor *src{nullptr}; // assigned from `input` in configure()
+ ICLTensor *dst{nullptr}; // assigned from `output` in configure()
+ std::unique_ptr<opencl::ClCopy> op{nullptr}; // null until assigned; the assignment is outside the lines shown in this diff
};
-CLCopy::CLCopy()
- : _impl(std::make_unique<Impl>())
+CLCopy::CLCopy() : _impl(std::make_unique<Impl>()) // allocates an Impl with null tensors and no operator
{
}
-CLCopy::CLCopy(CLCopy &&) = default;
+CLCopy::CLCopy(CLCopy &&) = default;
CLCopy &CLCopy::operator=(CLCopy &&) = default;
CLCopy::~CLCopy() = default;
@@ -57,6 +58,7 @@ void CLCopy::configure(ICLTensor *input, ICLTensor *output, Window *dst_window)
void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_LOG_PARAMS(input, output, dst_window);
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp
index 20cab4df5f..fc29c43827 100644
--- a/src/runtime/CL/functions/CLCrop.cpp
+++ b/src/runtime/CL/functions/CLCrop.cpp
@@ -27,8 +27,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCrop.h"
+#include "src/gpu/cl/operators/ClCrop.h"
#include <utility>
@@ -36,38 +38,57 @@ namespace arm_compute
{
struct CLCrop::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCrop> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCrop> op{nullptr};
};
-CLCrop::CLCrop()
- : _impl(std::make_unique<Impl>())
+CLCrop::CLCrop() : _impl(std::make_unique<Impl>())
{
}
-CLCrop::CLCrop(CLCrop &&) = default;
+CLCrop::CLCrop(CLCrop &&) = default;
CLCrop &CLCrop::operator=(CLCrop &&) = default;
CLCrop::~CLCrop() = default;
-void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
+void CLCrop::configure(const ICLTensor *src,
+ ICLTensor *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
- configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window);
+ configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value,
+ dst_window);
}
-void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
+void CLCrop::configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ ICLTensor *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
_impl->src = src;
_impl->dst = dst;
_impl->op = std::make_unique<opencl::ClCrop>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index,
+ extrapolation_value, dst_window);
}
-Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status CLCrop::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window);
}
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index 77c44d539b..821412b149 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -25,6 +25,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -35,7 +37,14 @@ namespace arm_compute
{
namespace
{
-inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+inline void configure_crop(const ICLTensor *input,
+ ICLTensor *crop_boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ uint32_t crop_box_ind,
+ Coordinates &start,
+ Coordinates &end,
+ uint32_t &batch_index)
{
batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
@@ -48,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen
// The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
- end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
- std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
+ end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1,
+ static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
output->info()->set_tensor_shape(out_shape);
}
} // namespace
CLCropResize::CLCropResize()
- : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions()
+ : _input(nullptr),
+ _boxes(nullptr),
+ _box_ind(nullptr),
+ _output(nullptr),
+ _num_boxes(0),
+ _method(),
+ _extrapolation_value(0),
+ _scale(),
+ _copy(),
+ _crop_results(),
+ _scaled_results(),
+ _internal_functions()
{
}
CLCropResize::~CLCropResize() = default;
-Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status CLCropResize::validate(const ITensorInfo *input,
+ ITensorInfo *boxes,
+ ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
TensorInfo temp_info;
- ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
- if(output->total_size() > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1},
+ input->dimension(3) - 1, extrapolation_value));
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -81,19 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen
return Status{};
}
-void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+ configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method,
+ extrapolation_value);
}
-void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
- ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+ crop_size, method, extrapolation_value));
+ ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
- TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+ TensorShape output_shape =
+ TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
_num_boxes = boxes->info()->tensor_shape()[1];
@@ -119,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
// kernels used for cropping and scaling.
_boxes->map(CLScheduler::get().queue());
_box_ind->map(CLScheduler::get().queue());
- for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
+ for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
{
auto crop_tensor = std::make_unique<CLTensor>();
TensorInfo crop_result_info(1, DataType::F32);
@@ -140,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index);
auto scale_kernel = std::make_unique<CLScale>();
- scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT });
+ scale_kernel->configure(
+ compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(),
+ ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT});
_scale.emplace_back(std::move(scale_kernel));
Window win = calculate_max_window(*_output->info());
@@ -156,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
bool is_width_flipped = end[0] < start[0];
bool is_height_flipped = end[1] < start[1];
/** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */
- std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+ std::array<int32_t, 2> rows_out_of_bounds{0};
/** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */
- std::array<int32_t, 2> cols_out_of_bounds{ 0 };
- if(is_height_flipped)
+ std::array<int32_t, 2> cols_out_of_bounds{0};
+ if (is_height_flipped)
{
- rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
- rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
+ rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(start[1] - _input->info()->dimension(2) + 1,
+ _crop_results[num_box].get()->info()->dimension(2))
+ : 0;
+ rows_out_of_bounds[1] =
+ end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+ : 0;
}
else
{
- rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
- rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
+ rows_out_of_bounds[0] =
+ start[1] < 0
+ ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+ : 0;
+ rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(end[1] - _input->info()->dimension(2) + 1,
+ _crop_results[num_box].get()->info()->dimension(2))
+ : 0;
}
- if(is_width_flipped)
+ if (is_width_flipped)
{
- cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
- cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
+ cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(start[0] - _input->info()->dimension(1) + 1,
+ _crop_results[num_box].get()->info()->dimension(1))
+ : 0;
+ cols_out_of_bounds[1] =
+ end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+ : 0;
}
else
{
- cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
- cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
+ cols_out_of_bounds[0] =
+ start[0] < 0
+ ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+ : 0;
+ cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(end[0] - _input->info()->dimension(1) + 1,
+ _crop_results[num_box].get()->info()->dimension(1))
+ : 0;
}
Window full_window = calculate_max_window(*_crop_results[num_box].get()->info());
@@ -200,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
// Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds
// with the extrapolation value using memset.
// First for the rows before the in bounds rows.
- if(rows_out_of_bounds[0] > 0)
+ if (rows_out_of_bounds[0] > 0)
{
Window slice_fill_rows_before(full_window);
slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_rows_before);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
Window slice_in(full_window);
- slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
- slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
-
- int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
- if(rows_in_bounds > 0)
+ slice_in.set(2,
+ Window::Dimension(rows_out_of_bounds[0],
+ _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
+ slice_in.set(1,
+ Window::Dimension(cols_out_of_bounds[0],
+ _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+ int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) -
+ rows_out_of_bounds[0] - rows_out_of_bounds[1];
+ if (rows_in_bounds > 0)
{
// Fill all elements that share a row with an in bounds element with the extrapolation value.
- if(cols_out_of_bounds[0] > 0)
+ if (cols_out_of_bounds[0] > 0)
{
Window slice_fill_cols_before(slice_in);
slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_cols_before);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
- if(cols_out_of_bounds[1] > 0)
+ if (cols_out_of_bounds[1] > 0)
{
Window slice_fill_cols_after(slice_in);
- slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1));
+ slice_fill_cols_after.set(
+ 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1],
+ _crop_results[num_box].get()->info()->dimension(1), 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_cols_after);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
// Copy all elements within the input bounds from the input tensor.
- int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
- if(cols_in_bounds > 0)
+ int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) -
+ cols_out_of_bounds[0] - cols_out_of_bounds[1];
+ if (cols_in_bounds > 0)
{
- Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
- is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
- Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
- is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+ Coordinates2D start_in{
+ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+ is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]};
+ Coordinates2D end_in{
+ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+ is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1};
auto kernel = std::make_unique<CLCrop>();
- kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in);
+ kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index,
+ extrapolation_value, &slice_in);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
}
// Fill all rows after the in bounds elements with the extrapolation value.
- if(rows_out_of_bounds[1] > 0)
+ if (rows_out_of_bounds[1] > 0)
{
Window slice_fill_rows_after(full_window);
- slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1));
+ slice_fill_rows_after.set(
+ 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1],
+ _crop_results[num_box].get()->info()->dimension(2), 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_rows_after);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
@@ -274,21 +357,21 @@ void CLCropResize::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
- for(unsigned int i = 0; i < _internal_functions.size(); ++i)
+ for (unsigned int i = 0; i < _internal_functions.size(); ++i)
{
_internal_functions[i]->run();
}
CLScheduler::get().sync();
- for(auto &kernel : _scale)
+ for (auto &kernel : _scale)
{
kernel->run();
}
CLScheduler::get().sync();
- for(auto &kernel : _copy)
+ for (auto &kernel : _copy)
{
kernel->run();
}
CLScheduler::get().sync();
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 918848745e..4e0d1501ba 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,12 +23,18 @@
*/
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/operators/ClTransposedConvolution.h"
+
#include <cmath>
#include <memory>
#include <tuple>
@@ -36,26 +42,62 @@
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
+struct CLDeconvolutionLayer::Impl
+{
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::IClOperator> op{nullptr};
+};
+
+CLDeconvolutionLayer::~CLDeconvolutionLayer() = default;
+
CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_manager(std::move(memory_manager)), _function()
+ : _memory_manager(std::move(memory_manager)), _function(), _impl(std::make_unique<Impl>())
{
}
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info);
}
-void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info);
- switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+ switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(),
+ deconv_info, weights_info))
{
case DeconvolutionMethod::DIRECT:
{
+ auto op = std::make_unique<opencl::ClTransposedConvolution>();
+ op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr,
+ output->info(), deconv_info);
+
+ _impl->src = input;
+ _impl->weights = weights;
+ _impl->biases = bias;
+ _impl->dst = output;
+
+ _impl->op = std::move(op);
+ break;
+ }
+ case DeconvolutionMethod::UPSCALE_CONV2D:
+ {
auto f = std::make_unique<CLDirectDeconvolutionLayer>();
f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info);
_function = std::move(f);
@@ -74,16 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC
}
}
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+ switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
{
case DeconvolutionMethod::DIRECT:
{
+ // Validate transposed convolution operator
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
+ break;
+ }
+ case DeconvolutionMethod::UPSCALE_CONV2D:
+ {
// Validate direct convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
break;
}
case DeconvolutionMethod::GEMM:
@@ -100,24 +154,40 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
return Status{};
}
-DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_UNUSED(output, bias, weights_info);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
- return DeconvolutionMethod::DIRECT;
+ return DeconvolutionMethod::UPSCALE_CONV2D;
}
const DataLayout data_layout = input->data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ const size_t ofm = weights->tensor_shape()[idx_n];
- if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
+ if (weights->dimension(idx_w) != deconv_info.stride().first ||
+ weights->dimension(idx_h) != deconv_info.stride().second)
{
- return DeconvolutionMethod::DIRECT;
+ // We observe better performance for FP32 types only when ofm <= 16, and for FP16 only when ofm <= 32.
+ if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)) &&
+ !((input->data_type() == DataType::F16) && (ofm > 32)))
+ {
+ return DeconvolutionMethod::DIRECT;
+ }
+ else
+ {
+ return DeconvolutionMethod::UPSCALE_CONV2D;
+ }
}
return DeconvolutionMethod::GEMM;
@@ -126,10 +196,29 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor
void CLDeconvolutionLayer::run()
{
prepare();
- _function->run();
+
+ if (_impl->op != nullptr)
+ {
+ // Optimized Operator will be used
+ ITensorPack pack;
+
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+ _impl->op->run(pack);
+ }
+ else
+ {
+ _function->run();
+ }
}
void CLDeconvolutionLayer::prepare()
{
- _function->prepare();
+ if (_impl->op == nullptr)
+ {
+ _function->prepare();
+ }
}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index c371558f30..b92bf903a6 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -27,20 +27,21 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
namespace arm_compute
{
CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
- : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()),
- _fill(),
- _output(nullptr)
+ : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr)
{
}
CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default;
-Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
+Status
+CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
{
return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info);
}
@@ -50,12 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, info);
_output = output;
- _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
+ _fill.configure(compile_context, _output,
+ PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
_upsample->configure(compile_context, input, _output, info);
}
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 6aa370b23c..6d2fea974e 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -26,8 +26,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCast.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
@@ -35,16 +37,15 @@ namespace arm_compute
{
struct CLDepthConvertLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCast> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCast> op{nullptr};
};
-CLDepthConvertLayer::CLDepthConvertLayer()
- : _impl(std::make_unique<Impl>())
+CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>())
{
}
-CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
+CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default;
CLDepthConvertLayer::~CLDepthConvertLayer() = default;
@@ -53,9 +54,14 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C
configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
}
-void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+void CLDepthConvertLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ uint32_t shift)
{
ARM_COMPUTE_UNUSED(shift);
+ ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift);
_impl->src = input;
_impl->dst = output;
@@ -67,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy);
}
-Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
return opencl::ClCast::validate(input, output, policy);
@@ -75,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo
void CLDepthConvertLayer::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
index bd2303c410..9477c7f81d 100644
--- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
#include <utility>
@@ -34,8 +35,12 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
auto k = std::make_unique<CLDepthToSpaceLayerKernel>();
k->configure(compile_context, input, output, block_shape);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 6467caffef..873601bb11 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,86 +25,23 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
namespace arm_compute
{
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_dwc;
-namespace
-{
-Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- const bool needs_permute = is_nhwc && (depth_multiplier > 1);
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_quantized && is_nhwc && !needs_permute);
-
- TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
- if(is_quantized)
- {
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
-
- const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
- output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- }
- }
-
- if(needs_permute)
- {
- TensorShape permuted_input_shape = input->tensor_shape();
- TensorShape permuted_weights_shape = weights->tensor_shape();
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
-
- permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
- permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
- permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
-
- const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
- const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
- const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
- conv_info, depth_multiplier, act_info,
- dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
- }
- else if(is_nhwc)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info,
- dilation));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info,
- dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
- }
- return Status{};
-}
-} // namespace
-
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_dwc_native_kernel(std::make_unique<CLDepthwiseConvolutionLayerNativeKernel>()),
_permute_input_to_nhwc(),
@@ -126,25 +63,34 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv
CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default;
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier,
+ act_info, dilation);
}
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
- ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- conv_info,
- depth_multiplier,
- act_info,
- dilation));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr,
+ output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
_is_quantized = is_data_type_quantized(input->info()->data_type());
_is_prepared = false;
@@ -153,10 +99,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
_output = output;
_needs_permute = input->info()->data_layout() == DataLayout::NCHW;
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
_memory_group.manage(&_permuted_output);
@@ -179,10 +127,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
CLTensor *output_multipliers_to_use = nullptr;
CLTensor *output_shifts_to_use = nullptr;
- if(_is_quantized)
+ if (_is_quantized)
{
- const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+ const size_t idx_c =
+ get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t num_filters =
+ (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
_output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
_output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
@@ -191,15 +141,19 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
output_shifts_to_use = &_output_shifts;
}
- DWCWeightsKernelInfo dwc_weights_info;
- dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
- DWCKernelInfo dwc_info;
- dwc_info.activation_info = act_info;
+ // Get the depthwise convolution compute parameters
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
+
+ const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
+
+ _dwc_native_kernel->set_target(gpu_target);
_dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
- dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
- output_multipliers_to_use, output_shifts_to_use);
+ dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use,
+ output_shifts_to_use);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
@@ -209,37 +163,51 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
_permuted_output.allocator()->allocate();
}
- if(_is_quantized)
+ if (_is_quantized)
{
_output_multipliers.allocator()->allocate();
_output_shifts.allocator()->allocate();
}
}
-Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+
+ const bool in_place = input == output || output == nullptr;
+ if (in_place)
+ {
+ output = input;
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) >
+ input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) >
+ input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
- DWCWeightsKernelInfo dwc_weights_info;
- dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
- DWCKernelInfo dwc_info;
- dwc_info.activation_info = act_info;
+ const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
const bool needs_permute = input->data_layout() == DataLayout::NCHW;
const bool is_quantized = is_data_type_quantized(input->data_type());
TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
- if(is_quantized)
+ if (is_quantized)
{
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
@@ -252,73 +220,95 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
}
}
- if(needs_permute)
+ if (needs_permute)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation};
+ TensorShape permuted_output_shape =
+ shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
- const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
- const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_input = input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_input_shape)
+ .set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_weights = weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_weights_shape)
+ .set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_output = output->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_output_shape)
+ .set_data_layout(DataLayout::NHWC);
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_weights_info,
- dwc_info, conv_info, depth_multiplier, dilation,
- &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+
+ // Get the depthwise convolution compute parameters
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+ &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info,
+ &output_multipliers_shifts_info, &output_multipliers_shifts_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier,
- dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ // Get the depthwise convolution compute parameters
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(input, weights, conv_info, dilation, depth_multiplier);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+ input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
+ &output_multipliers_shifts_info));
}
return Status{};
}
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run()
+void CLDepthwiseConvolutionLayer::run()
{
prepare();
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_to_nhwc.run();
}
CLScheduler::get().enqueue(*_dwc_native_kernel);
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_to_nchw.run();
}
}
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
+void CLDepthwiseConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- if(_is_quantized)
+ if (_is_quantized)
{
_output_multipliers.map();
_output_shifts.map();
- const unsigned int idx_ofms = _needs_permute ? 2 : 0;
- quantization::compute_quantized_multipliers_and_shifts(_input->info(),
- _original_weights->info(),
- _output->info(),
- idx_ofms,
- reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
- reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+ quantization::compute_quantized_multipliers_and_shifts(
+ _input->info(), _original_weights->info(), _output != nullptr ? _output->info() : _input->info(),
+ reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+ reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
_output_multipliers.unmap();
_output_shifts.unmap();
}
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
@@ -329,305 +319,4 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
_is_prepared = true;
}
}
-
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)),
- _kernel_nchw(nullptr),
- _kernel_nhwc(nullptr),
- _border_handler(std::make_unique<CLFillBorderKernel>()),
- _permute_input_to_nchw(),
- _permute_weights_to_nchw(),
- _permute_output_to_nhwc(),
- _permuted_input(),
- _permuted_weights(),
- _permuted_output(),
- _output_multipliers(),
- _output_shifts(),
- _original_weights(nullptr),
- _input(nullptr),
- _output(nullptr),
- _needs_permute(false),
- _is_prepared(false),
- _is_quantized(false),
- _is_nhwc(false)
-{
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
- ICLTensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- conv_info,
- depth_multiplier,
- act_info,
- dilation));
-
- _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _needs_permute = _is_nhwc && (depth_multiplier > 1);
-
- _is_prepared = false;
- _original_weights = weights;
- _input = input;
- _output = output;
-
- ICLTensor *input_to_use = input;
- const ICLTensor *weights_to_use = weights;
- ICLTensor *output_to_use = output;
-
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
-
- if(_needs_permute)
- {
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_output);
-
- // Configure the function to transform the input tensor from NHWC -> NCHW
- _permute_input_to_nchw.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
- // Configure the function to transform the weights tensor from HWI -> IHW
- _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
- _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
- input_to_use = &_permuted_input;
- weights_to_use = &_permuted_weights;
- output_to_use = &_permuted_output;
-
- _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
- }
- else if(_is_nhwc)
- {
- _kernel_nhwc = std::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
- }
- else
- {
- _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
- }
-
- CLTensor *output_multipliers_to_use = nullptr;
- CLTensor *output_shifts_to_use = nullptr;
- if(_is_quantized)
- {
- const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1;
-
- _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
- _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-
- output_multipliers_to_use = &_output_multipliers;
- output_shifts_to_use = &_output_shifts;
- }
-
- // Configure kernel
- if(_is_nhwc && !_needs_permute)
- {
- _kernel_nhwc->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
- act_info, dilation);
- }
- else
- {
- _kernel_nchw->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
- act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
- }
-
- if(_is_quantized)
- {
- _output_multipliers.allocator()->allocate();
- _output_shifts.allocator()->allocate();
- }
-
- // Permute output if needed
- if(_needs_permute)
- {
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permuted_output.info()->set_data_layout(DataLayout::NCHW);
- _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U));
-
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
- // Configure border handler
- PixelValue &&zero_value(0.f);
- if(is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
- }
- if(!_is_nhwc || _needs_permute)
- {
- _border_handler->configure(compile_context, input_to_use, _kernel_nchw->border_size(), BorderMode::CONSTANT, zero_value);
- }
-}
-
-Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_needs_permute)
- {
- _permute_input_to_nchw.run();
- }
- CLScheduler::get().enqueue(*_border_handler);
- if(_is_nhwc && !_needs_permute)
- {
- CLScheduler::get().enqueue(*_kernel_nhwc);
- }
- else
- {
- CLScheduler::get().enqueue(*_kernel_nchw);
- }
-
- if(_needs_permute)
- {
- _permute_output_to_nhwc.run();
- }
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare()
-{
- if(!_is_prepared)
- {
- if(_is_quantized)
- {
- _output_multipliers.map();
- _output_shifts.map();
- const unsigned int idx_ofms = _is_nhwc ? 0 : 2;
- quantization::compute_quantized_multipliers_and_shifts(_input->info(),
- _original_weights->info(),
- _output->info(),
- idx_ofms,
- reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
- reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
- _output_multipliers.unmap();
- _output_shifts.unmap();
- }
-
- if(_needs_permute)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _permuted_weights.allocator()->allocate();
- _permute_weights_to_nchw.run();
- _original_weights->mark_as_unused();
- }
-
- _is_prepared = true;
- }
-}
-
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_manager(std::move(memory_manager)), _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_3x3(), _func_generic()
-{
-}
-
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- ActivationLayerInfo act_info, const Size2D &dilation)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- ActivationLayerInfo act_info, const Size2D &dilation)
-{
- _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
- dilation);
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_3x3.set_memory_group(_memory_manager);
- _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- {
- _func_generic.set_memory_group(_memory_manager);
- _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
- }
-}
-
-Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- switch(depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- case DepthwiseConvolutionFunction::GENERIC:
- return CLDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- default:
- ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
- }
-}
-
-DepthwiseConvolutionFunction CLDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
- if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)))
- {
- return DepthwiseConvolutionFunction::OPTIMIZED;
- }
- else
- {
- return DepthwiseConvolutionFunction::GENERIC;
- }
-}
-
-void CLDepthwiseConvolutionLayer::run()
-{
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_3x3.run();
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.run();
- break;
- default:
- ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
- }
-}
-
-void CLDepthwiseConvolutionLayer::prepare()
-{
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_3x3.prepare();
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.prepare();
- break;
- default:
- ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
- }
-}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 3b104017e7..20162a03db 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -26,20 +26,21 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClDequantize.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
namespace arm_compute
{
struct CLDequantizationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDequantize> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDequantize> op{nullptr};
};
-CLDequantizationLayer::CLDequantizationLayer()
- : _impl(std::make_unique<Impl>())
+CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>())
{
}
CLDequantizationLayer::~CLDequantizationLayer() = default;
@@ -49,8 +50,11 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
configure(CLKernelLibrary::get().get_compile_context(), input, output);
}
-void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output);
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 907e69d8d7..d6dae0d732 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -28,37 +28,49 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/runtime/gpu/cl/operators/ClActivation.h"
-#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
namespace arm_compute
{
struct CLDirectConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDirectConv2d> op{nullptr};
};
-CLDirectConvolutionLayer::CLDirectConvolutionLayer()
- : _impl(std::make_unique<Impl>())
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>())
{
}
-CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default;
CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default;
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
}
-void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
_impl->src = input;
_impl->weights = weights;
@@ -66,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClDirectConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
}
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -84,4 +101,4 @@ void CLDirectConvolutionLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 8d1a91e420..7cd268ab0b 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,12 +26,13 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include <memory>
@@ -54,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa
{
}
-Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
const DataLayout data_layout = input->data_layout();
@@ -66,23 +72,25 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1);
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+ weights->dimension(idx_w), weights->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- if(input->data_type() != weights->data_type())
+ if (input->data_type() != weights->data_type())
{
- ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL ||
+ !is_data_type_quantized_asymmetric(input->data_type()));
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -101,26 +109,42 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
unsigned int deconv_pad_y = 0;
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+ out_dims, deconv_pad_x, deconv_pad_y);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
return Status{};
}
-void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
}
-void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info);
const unsigned int pad_left = info.pad_left();
const unsigned int pad_right = info.pad_right();
@@ -137,17 +161,21 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_original_weights = weights;
_flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, /* use_inverted_axis */ false);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
+ auto out_dims =
+ deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
_is_prepared = weights_info.retain_internal_weights();
@@ -156,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
unsigned int deconv_pad_x = 0;
unsigned int deconv_pad_y = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+ *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0;
unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
@@ -177,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_scaled_output.allocator()->init(scale_out_info);
// configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+ const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top,
+ deconv_pad_bottom, DimensionRoundingType::FLOOR);
_scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
// Setup the function to convolve the upscaled output
@@ -189,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_flip_axis.allocator()->allocate();
_flip_axis.map(true);
auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
- if(weights->info()->data_layout() == DataLayout::NHWC)
+ if (weights->info()->data_layout() == DataLayout::NHWC)
{
axis_data[0] = 1;
axis_data[1] = 2;
@@ -214,7 +244,7 @@ void CLDirectDeconvolutionLayer::run()
void CLDirectDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
@@ -227,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare()
_conv_f.prepare();
// Free flipped weights
- if(!_weights_flipped.is_used())
+ if (!_weights_flipped.is_used())
{
_weights_flipped.allocator()->free();
}
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 60c699cbb8..d9529f0b7f 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -26,36 +26,40 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClAdd.h"
-#include "src/runtime/gpu/cl/operators/ClElementwiseOperations.h"
-#include "src/runtime/gpu/cl/operators/ClSub.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClElementwiseOperations.h"
+#include "src/gpu/cl/operators/ClSub.h"
namespace arm_compute
{
struct CLArithmeticAddition::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClAdd> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClAdd> op{nullptr};
};
-CLArithmeticAddition::CLArithmeticAddition()
- : _impl(std::make_unique<Impl>())
+CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
CLArithmeticAddition::~CLArithmeticAddition() = default;
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(
+ ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
}
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
@@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
}
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticAddition::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
}
@@ -82,26 +90,33 @@ void CLArithmeticAddition::run()
struct CLArithmeticSubtraction::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClSub> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClSub> op{nullptr};
};
-CLArithmeticSubtraction::CLArithmeticSubtraction()
- : _impl(std::make_unique<Impl>())
+CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
CLArithmeticSubtraction::~CLArithmeticSubtraction() = default;
-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
}
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
@@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
}
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClSub::validate(input1, input2, output, policy, act_info);
}
@@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run()
struct CLArithmeticDivision::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseDivision> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr};
};
-CLArithmeticDivision::CLArithmeticDivision()
- : _impl(std::make_unique<Impl>())
+CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
CLArithmeticDivision::~CLArithmeticDivision() = default;
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLArithmeticDivision::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info);
}
@@ -173,26 +201,32 @@ void CLArithmeticDivision::run()
struct CLElementwiseMax::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseMax> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseMax> op{nullptr};
};
-CLElementwiseMax::CLElementwiseMax()
- : _impl(std::make_unique<Impl>())
+CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
CLElementwiseMax::~CLElementwiseMax() = default;
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMax::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseMax::validate(input1, input2, output, act_info);
}
@@ -218,26 +255,32 @@ void CLElementwiseMax::run()
struct CLElementwiseMin::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseMin> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseMin> op{nullptr};
};
-CLElementwiseMin::CLElementwiseMin()
- : _impl(std::make_unique<Impl>())
+CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
+CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default;
CLElementwiseMin::~CLElementwiseMin() = default;
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMin::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseMin::validate(input1, input2, output, act_info);
}
@@ -263,26 +309,32 @@ void CLElementwiseMin::run()
struct CLElementwiseSquaredDiff::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr};
};
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()
- : _impl(std::make_unique<Impl>())
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default;
CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default;
-void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info);
}
@@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run()
struct CLElementwisePower::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwisePower> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwisePower> op{nullptr};
};
-CLElementwisePower::CLElementwisePower()
- : _impl(std::make_unique<Impl>())
+CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>())
{
}
-CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
+CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default;
CLElementwisePower::~CLElementwisePower() = default;
-void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwisePower::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwisePower::validate(input1, input2, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
index a45dd6f9a6..3043c26feb 100644
--- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -25,24 +25,24 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClElementwiseUnary.h"
+#include "src/gpu/cl/operators/ClElementwiseUnary.h"
namespace arm_compute
{
struct CLRsqrtLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClRsqrt> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClRsqrt> op{nullptr};
};
-CLRsqrtLayer::CLRsqrtLayer()
- : _impl(std::make_unique<Impl>())
+CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>())
{
}
-CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
CLRsqrtLayer::~CLRsqrtLayer() = default;
@@ -74,17 +74,16 @@ void CLRsqrtLayer::run()
struct CLExpLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClExp> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClExp> op{nullptr};
};
-CLExpLayer::CLExpLayer()
- : _impl(std::make_unique<Impl>())
+CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>())
{
}
-CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
CLExpLayer::~CLExpLayer() = default;
@@ -116,17 +115,16 @@ void CLExpLayer::run()
struct CLNegLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClNeg> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClNeg> op{nullptr};
};
-CLNegLayer::CLNegLayer()
- : _impl(std::make_unique<Impl>())
+CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>())
{
}
-CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
CLNegLayer::~CLNegLayer() = default;
@@ -157,17 +155,16 @@ void CLNegLayer::run()
struct CLSinLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClSin> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClSin> op{nullptr};
};
-CLSinLayer::CLSinLayer()
- : _impl(std::make_unique<Impl>())
+CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>())
{
}
-CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
CLSinLayer::~CLSinLayer() = default;
@@ -198,17 +195,16 @@ void CLSinLayer::run()
struct CLAbsLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClAbs> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClAbs> op{nullptr};
};
-CLAbsLayer::CLAbsLayer()
- : _impl(std::make_unique<Impl>())
+CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>())
{
}
-CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
CLAbsLayer::~CLAbsLayer() = default;
@@ -239,17 +235,16 @@ void CLAbsLayer::run()
struct CLLogLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClLog> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClLog> op{nullptr};
};
-CLLogLayer::CLLogLayer()
- : _impl(std::make_unique<Impl>())
+CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>())
{
}
-CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
CLLogLayer::~CLLogLayer() = default;
@@ -280,17 +275,16 @@ void CLLogLayer::run()
struct CLRoundLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClRound> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClRound> op{nullptr};
};
-CLRoundLayer::CLRoundLayer()
- : _impl(std::make_unique<Impl>())
+CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>())
{
}
-CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
CLRoundLayer::~CLRoundLayer() = default;
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index cf136dc75e..48e9ae824a 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
@@ -52,10 +54,14 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+void CLFFT1D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT1DInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+ ARM_COMPUTE_LOG_PARAMS(input, output, config);
// Decompose size to radix factors
const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -74,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
_digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
_memory_group.manage(&_digit_reversed_input);
- _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+ _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices,
+ digit_reverse_config);
// Create and configure FFT kernels
unsigned int Nx = 1;
_num_ffts = decomposed_vector.size();
_fft_kernels.reserve(_num_ffts);
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -90,18 +97,20 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
fft_kernel_info.Nx = Nx;
fft_kernel_info.is_first_stage = (i == 0);
_fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>());
- _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+ _fft_kernels.back()->configure(compile_context, &_digit_reversed_input,
+ ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
Nx *= radix_for_stage;
}
// Configure scale kernel
- if(_run_scale)
+ if (_run_scale)
{
FFTScaleKernelInfo scale_config;
scale_config.scale = static_cast<float>(N);
scale_config.conjugate = config.direction == FFTDirection::Inverse;
- is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+ is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config)
+ : _scale_kernel->configure(output, nullptr, scale_config);
}
// Allocate tensors
@@ -120,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
// Check if FFT is decomposable
const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -129,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
@@ -148,13 +157,13 @@ void CLFFT1D::run()
CLScheduler::get().enqueue(*_digit_reverse_kernel, false);
// Run radix kernels
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
}
// Run output scaling
- if(_run_scale)
+ if (_run_scale)
{
CLScheduler::get().enqueue(*_scale_kernel, true);
}
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index e0497ca6dc..3857046719 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
@@ -33,7 +35,10 @@
namespace arm_compute
{
CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+ : _memory_group(memory_manager),
+ _first_pass_func(memory_manager),
+ _second_pass_func(memory_manager),
+ _first_pass_tensor()
{
}
@@ -44,10 +49,14 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+void CLFFT2D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT2DInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
+ ARM_COMPUTE_LOG_PARAMS(input, output, config);
// Setup first pass
FFT1DInfo first_pass_config;
@@ -85,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index 41b02d03f2..2a73517549 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,12 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
@@ -48,11 +50,11 @@ int pad_decomposable(int N)
int pad = 0;
bool is_decomposed = false;
- while(!is_decomposed)
+ while (!is_decomposed)
{
const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
is_decomposed = !decomposed_vector.empty();
- if(!is_decomposed)
+ if (!is_decomposed)
{
++pad;
}
@@ -102,17 +104,32 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
{
}
-void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+ enable_fast_math);
}
-void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
- ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(), conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_original_weights = weights;
_original_bias = biases;
@@ -121,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_has_bias = biases != nullptr;
// Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
- const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
- pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ const Size2D input_dims =
+ Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size =
+ Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
// Tensors to use
ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = _has_bias ? &_bias_output : output;
// Permute bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
_permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
_permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -143,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Permute input if needed
_needs_permute = input->info()->data_layout() == DataLayout::NHWC;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -161,10 +181,11 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Flip weights
_flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
_flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
- _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
+ _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis,
+ /* use_inverted_axis */ false);
// Pad weights
- const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
_pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
// Transform weights
@@ -172,10 +193,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
// Pad input
- const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
_memory_group.manage(&_padded_input);
_pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
}
@@ -199,7 +220,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_memory_group.manage(&_itransformed_output);
FFT2DInfo itranform_info;
itranform_info.direction = FFTDirection::Inverse;
- _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransformed_output.allocator()->init(
+ _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
_itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
_output_reduced.allocator()->allocate();
@@ -211,25 +233,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Extract correct region
const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
- const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
- const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
- if(_has_bias)
+ const int end_right =
+ _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+ const int end_botton =
+ _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if (_has_bias)
{
_memory_group.manage(&_bias_output);
}
- else if(_needs_permute)
+ else if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
}
- _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use,
+ Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
_itransformed_output.allocator()->allocate();
// Add bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
@@ -240,7 +265,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
}
// Permute output
- if(_needs_permute)
+ if (_needs_permute)
{
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -252,7 +277,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.configure(compile_context, output, nullptr, act_info);
}
@@ -266,8 +291,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_flip_axis.unmap();
}
-Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math);
@@ -284,24 +314,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
const auto strides = conv_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+ conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+ conv_info.pad_bottom() != (kernel_size.y() / 2));
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x());
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+ (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
// Validate Activation Layer
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
}
@@ -317,7 +350,7 @@ void CLFFTConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Transform input
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_func.run();
}
@@ -333,17 +366,17 @@ void CLFFTConvolutionLayer::run()
_reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
_extract_output_func.run();
// Add bias
- if(_has_bias)
+ if (_has_bias)
{
_bias_add_func.run();
}
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_func.run();
}
// Run activation layer
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.run();
}
@@ -351,10 +384,10 @@ void CLFFTConvolutionLayer::run()
void CLFFTConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Permute bias to NCHW
- if(_original_bias != nullptr)
+ if (_original_bias != nullptr)
{
_permuted_bias.allocator()->allocate();
_permute_bias_func.run();
@@ -363,7 +396,7 @@ void CLFFTConvolutionLayer::prepare()
const ICLTensor *cur_weights = _original_weights;
// Permute weights
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index b22d79fea4..9bd96a975e 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClFill.h"
+#include "src/gpu/cl/operators/ClFill.h"
#include <utility>
@@ -36,16 +37,15 @@ namespace arm_compute
{
struct CLFill::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFill> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFill> op{nullptr};
};
-CLFill::CLFill()
- : _impl(std::make_unique<Impl>())
+CLFill::CLFill() : _impl(std::make_unique<Impl>())
{
}
-CLFill::CLFill(CLFill &&) = default;
+CLFill::CLFill(CLFill &&) = default;
CLFill &CLFill::operator=(CLFill &&) = default;
CLFill::~CLFill() = default;
@@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind
configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window);
}
-void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window)
+void CLFill::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ const PixelValue &constant_value,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
deleted file mode 100644
index 2e5a29ece1..0000000000
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
-{
- configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value);
-}
-
-void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
-{
- auto k = std::make_unique<CLFillBorderKernel>();
- k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value);
- _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 9563055276..ba1b5372d3 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -26,26 +26,26 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/gpu/cl/operators/ClFlatten.h"
+#include "src/gpu/cl/operators/ClFlatten.h"
namespace arm_compute
{
struct CLFlattenLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFlatten> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFlatten> op{nullptr};
};
-CLFlattenLayer::CLFlattenLayer()
- : _impl(std::make_unique<Impl>())
+CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>())
{
}
-CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
+CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default;
CLFlattenLayer::~CLFlattenLayer() = default;
@@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_impl->src = input;
_impl->dst = output;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_flatten_shape(input->info())));
_impl->op = std::make_unique<opencl::ClFlatten>();
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info());
@@ -68,9 +69,10 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
}
return opencl::ClFlatten::validate(input, output);
@@ -83,4 +85,4 @@ void CLFlattenLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 4c5e482b10..4322219dd9 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -27,23 +27,23 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClFloor.h"
+#include "src/gpu/cl/operators/ClFloor.h"
namespace arm_compute
{
struct CLFloor::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFloor> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFloor> op{nullptr};
};
-CLFloor::CLFloor()
- : _impl(std::make_unique<Impl>())
+CLFloor::CLFloor() : _impl(std::make_unique<Impl>())
{
}
-CLFloor::CLFloor(CLFloor &&) = default;
+CLFloor::CLFloor(CLFloor &&) = default;
CLFloor &CLFloor::operator=(CLFloor &&) = default;
CLFloor::~CLFloor() = default;
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 31c8908270..b30f9e701f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,472 +23,137 @@
*/
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
-#include "support/Cast.h"
-#include <algorithm>
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/operators/ClFullyConnected.h"
namespace arm_compute
{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::utils::cast;
+using namespace arm_compute::experimental;
-namespace
+struct CLFullyConnectedLayer::Impl
{
-Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
-{
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
- gemmlowp_output_stage.gemmlowp_multiplier = 0;
- gemmlowp_output_stage.gemmlowp_shift = 0;
-
- const auto data_type = input.data_type();
-
- // Configure output stage for quantized case
- if(is_data_type_quantized_asymmetric(data_type))
- {
- const QuantizationInfo oq_info = output.quantization_info();
- const UniformQuantizationInfo iq_unif = input.quantization_info().uniform();
- const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
- const UniformQuantizationInfo oq_unif = oq_info.uniform();
-
- const auto output_quant_info = (output.total_size() == 0) ? iq_unif : oq_unif;
-
- const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
-
- if(activation_info.enabled())
- {
- std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
- }
-
- // Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
- gemmlowp_output_stage.gemmlowp_shift = output_shift;
- gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
- gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
- type_min.get(gemmlowp_output_stage.gemmlowp_min_bound);
- type_max.get(gemmlowp_output_stage.gemmlowp_max_bound);
- }
-
- return Status{};
-}
-
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info)
-{
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage, fc_info.activation_info));
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- 0, // depth_output_gemm3d
- false, // reinterpret_input_as_3d
- fc_info.retain_internal_weights, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- fc_info.fp_mixed_precision, // fp_mixed_precision
- true, // broadcast_bias
- ActivationLayerInfo()); // activation_info
+ std::unique_ptr<opencl::ClFullyConnected> op{nullptr};
- if(is_data_type_quantized_asymmetric(input.data_type()))
- {
- const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
-
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset);
- const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
+ const ITensor *original_weights{nullptr};
- // Validate gemmlowp function
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
- &weights.clone()->set_quantization_info(weights_quantization_info),
- bias,
- &output,
- gemm_info));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
- }
+ ITensorPack run_pack{};
+ WorkspaceData<CLTensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
- return Status{};
-}
-} // namespace
+ bool is_prepared{false};
+ bool dynamic_weights{false};
+};
-CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), _reshape_weights_function(),
- _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true),
- _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+ _impl->weights_manager = weights_manager;
}
-void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const FullyConnectedLayerInfo &fc_info)
-{
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info);
-
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- 0, // depth_output_gemm3d
- false, // reinterpret_input_as_3d
- fc_info.retain_internal_weights, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- fc_info.fp_mixed_precision, // fp_mixed_precision
- true, // broadcast_bias
- fc_info.activation_info, // activation_info
- fc_info.constant_weights); // constant_weights
-
- if(_is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = input->info()->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
-
- input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+CLFullyConnectedLayer::~CLFullyConnectedLayer() = default;
- // Configure gemmlowp function
- _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info);
-
- // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers
- input->info()->set_quantization_info(input_quantization_info);
- weights->info()->set_quantization_info(weights_quantization_info);
- }
- else
- {
- // Configure matrix multiply kernel
- _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info);
- }
-}
-
-void CLFullyConnectedLayer::configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const FullyConnectedLayerInfo &fc_info)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for flatten
- TensorShape shape_flatten = compute_flatten_shape(input->info());
- _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW));
-
- // Configure flatten kernel
- _memory_group.manage(&_flatten_output);
- _flatten_layer.configure(compile_context, input, &_flatten_output);
-
- // Configure matrix multiply kernel
- configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info);
-
- // Allocate the output tensor for flatten once all the configure methods have been called
- _flatten_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const FullyConnectedLayerInfo &fc_info)
-{
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
- // Configure matrix multiply kernel
- configure_mm(compile_context, input, weights, bias, output, fc_info);
-}
-
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info);
}
-void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info)
{
+ // Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info));
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- fc_info));
+ _impl->op = std::make_unique<opencl::ClFullyConnected>();
+ _impl->original_weights = weights;
+ _impl->is_prepared = fc_info.retain_internal_weights;
- _are_weights_converted = true;
- _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- _is_fc_after_conv = true;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _is_prepared = fc_info.retain_internal_weights;
- _original_weights = weights;
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
- if(_weights_manager)
+ if (_impl->weights_manager != nullptr)
{
- _weights_manager->manage(weights);
+ _impl->weights_manager->manage(_impl->original_weights);
}
- const ICLTensor *weights_to_use = weights;
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
- if(is_batched_fc_layer)
+ if (!_impl->is_prepared)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->workspace =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
else
{
- _is_fc_after_conv = input->info()->num_dimensions() > 1;
- }
-
- // Reshape weights if needed
- if(!_are_weights_reshaped)
- {
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed_function.configure(compile_context, weights);
- weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function));
- }
- else
- {
- // Reshape the weights
- _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output);
- weights_to_use = &_reshape_weights_output;
- }
- }
-
- // Convert weights if needed
- if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
- {
- if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
- {
- _convert_weights_managed.configure(compile_context, weights_to_use,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
- weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed));
- }
- else
- {
- // Convert weights
- _convert_weights.configure(compile_context, weights_to_use,
- &_converted_weights_output,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
-
- weights_to_use = &_converted_weights_output;
- }
- _are_weights_converted = false;
+ _impl->run_pack.add_tensor(ACL_SRC_0, input);
+ _impl->run_pack.add_tensor(ACL_DST, output);
}
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info);
- }
+ _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
}
-Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLFullyConnectedLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
FullyConnectedLayerInfo fc_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
- ARM_COMPUTE_RETURN_ERROR_ON(!fc_info.constant_weights && (!fc_info.are_weights_reshaped || fc_info.transpose_weights));
-
- bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- bool is_fc_after_conv = true;
-
- const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensorInfo *input_to_use = input;
- const ITensorInfo *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = output->dimension(1) > 1;
- if(is_batched_fc_layer)
- {
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
- input->tensor_shape().cend(),
- output->tensor_shape().cbegin() + 1));
- }
- else
- {
- is_fc_after_conv = input->num_dimensions() > 1;
- }
-
- if(!weights_reshaped)
- {
- // Validate reshape weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights, &reshaped_weights));
- weights_to_use = &reshaped_weights;
- }
-
- if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
- {
- // Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- input->tensor_shape(),
- fc_info.weights_trained_layout));
- weights_to_use = &converted_weights;
- }
-
- if(is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
-
- // Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
- input_to_use = &flatten_input;
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
- }
-
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
-
- return Status{};
+ return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info);
}
void CLFullyConnectedLayer::run()
{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Linearize input if it comes from a convolutional layer
- if(_is_fc_after_conv)
+ if (!_impl->dynamic_weights)
{
- _flatten_layer.run();
+ prepare();
}
- // Run matrix multiply
- if(_is_quantized)
- {
- _mm_gemmlowp.run();
- }
- else
- {
- _mm_gemm.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void CLFullyConnectedLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- if(!_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- }
-
- auto release_unused = [](CLTensor * w)
- {
- if(!w->is_used())
- {
- CLScheduler::get().queue().finish();
- w->allocator()->free();
- }
- };
+ _impl->op->prepare(_impl->run_pack);
- // Pointer to current weights
- const ICLTensor *cur_weights = _original_weights;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<CLTensor>(_impl->aux_mem_req, _impl->workspace);
+ _impl->is_prepared = true;
- // Reshape of the weights if needed (happens only once)
- if(!_are_weights_reshaped)
+ // Handle weights managed infrastructure
+ if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
{
- if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare
+ // This is for cases where multiple functions share the same b (weights)
+ // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference
+ const ITensor *original_b = _impl->original_weights;
+ if (!original_b->is_used())
{
- cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+ _impl->weights_manager->pre_mark_as_unused(original_b);
}
- else
- {
- // Run reshape weights kernel and mark weights as unused
- _reshape_weights_output.allocator()->allocate();
- _reshape_weights_function.run();
-
- cur_weights->mark_as_unused();
- cur_weights = &_reshape_weights_output;
- }
- _are_weights_reshaped = true;
+ _impl->original_weights->mark_as_used();
+ _impl->weights_manager->release(_impl->original_weights);
}
-
- // Convert weights if needed (happens only once)
- if(!_are_weights_converted)
- {
- if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
- {
- _weights_manager->run(cur_weights, &_convert_weights_managed);
- }
- else
- {
- _converted_weights_output.allocator()->allocate();
- _convert_weights.run();
- cur_weights->mark_as_unused();
- }
-
- _are_weights_converted = true;
- }
-
- // Release reshaped weights if unused
- release_unused(&_reshape_weights_output);
-
- // Prepare GEMM prepare and release unused weights
- if(!_is_quantized)
- {
- _mm_gemm.prepare();
- }
-
- // Release converted weights if unused
- release_unused(&_reshape_weights_output);
- release_unused(&_converted_weights_output);
-
- _is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 2945508012..e4fbf78e13 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,8 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
namespace arm_compute
@@ -39,28 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization()
CLFuseBatchNormalization::~CLFuseBatchNormalization() = default;
-void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
+ _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
+ bn_beta, bn_gamma, epsilon, fbn_type);
}
-Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
void CLFuseBatchNormalization::run()
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 35126ec0d7..871a1d6e27 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,9 +30,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemm.h"
namespace arm_compute
{
@@ -41,19 +41,15 @@ using OperatorType = opencl::ClGemm;
struct CLGEMM::Impl
{
- const ICLTensor *a{ nullptr };
- const ICLTensor *b{ nullptr };
- const ICLTensor *c{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *b{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
- CLTensor weights_transformed{};
+ IWeightsManager *weights_manager{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
MemoryRequirements aux_mem_req{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
@@ -65,41 +61,59 @@ CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *
CLGEMM::~CLGEMM() = default;
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
}
-void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- _impl->a = a;
_impl->b = b;
- _impl->c = c;
- _impl->dst = output;
_impl->op = std::make_unique<OperatorType>();
_impl->is_prepared = gemm_info.retain_internal_weights();
- _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info);
+ _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+ alpha, beta, gemm_info);
_impl->aux_mem_req = _impl->op->workspace();
// Manage/allocate auxilairy tensors
- if(_impl->is_prepared)
+ if (_impl->is_prepared)
{
- _impl->run_pack.add_const_tensor(ACL_SRC_0, _impl->a);
- _impl->run_pack.add_tensor(ACL_DST, _impl->dst);
+ _impl->run_pack.add_const_tensor(ACL_SRC_0, a);
+ _impl->run_pack.add_tensor(ACL_DST, output);
}
else
{
- _impl->run_pack = { { ACL_SRC_0, _impl->a }, { ACL_SRC_2, _impl->c }, { ACL_DST, _impl->dst } };
- _impl->prep_pack = { { ACL_SRC_1, _impl->b } };
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, _impl->b}};
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info);
}
@@ -110,21 +124,20 @@ void CLGEMM::run()
MemoryGroupResourceScope scope_mg(_impl->memory_group);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->a, _impl->b, _impl->dst);
_impl->op->run(_impl->run_pack);
}
void CLGEMM::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->b->mark_as_unused();
}
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 3184d5dfe0..aef7cddd7a 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,22 +23,17 @@
*/
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLCol2ImKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
#include "support/Cast.h"
#include <cmath>
@@ -49,634 +44,117 @@ namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::utils::cast;
+using namespace arm_compute::experimental;
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(std::make_unique<CLWeightsReshapeKernel>())
-{
-}
-
-CLConvolutionLayerReshapeWeights::~CLConvolutionLayerReshapeWeights() = default;
-
-void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups);
-}
-
-void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
-{
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info(),
- num_groups));
-
- const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
-
- _weights_reshape_kernel->configure(compile_context, weights, biases_to_use, output, num_groups);
-
- output->info()->set_quantization_info(weights->info()->quantization_info());
-}
-
-Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
+struct CLGEMMConvolutionLayer::Impl
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(weights->data_type()));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
- CLWeightsReshapeKernel::validate(weights, biases, output, num_groups);
- }
-
- return Status{};
-}
-
-void CLConvolutionLayerReshapeWeights::run()
-{
- CLScheduler::get().enqueue(*_weights_reshape_kernel);
-}
-
-CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(std::make_unique<CLIm2ColKernel>()), _mm_gemm(memory_manager,
- weights_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(std::make_unique<CLCol2ImKernel>()), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(),
- _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
+ const ITensor *weights{nullptr};
+ std::unique_ptr<opencl::ClGemmConv2d> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<CLTensor> workspace_tensors{};
+ bool is_prepared{false};
+};
+
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_group = MemoryGroup(memory_manager);
+ _impl->weights_manager = weights_manager;
}
CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
-void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info)
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
-
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- gemm_3d_depth, // depth_output_gemm3d
- _skip_im2col, // reinterpret_input_as_3d
- false, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- false, // fp_mixed_precision
- true, // broadcast_bias
- act_info); // activation_info
-
- if(_is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = input->info()->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
-
- input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
- _mm_gemmlowp.configure(compile_context, input, weights, biases, output, gemm_info);
-
- // Revert back QuantizatioInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(input_quantization_info);
- weights->info()->set_quantization_info(weights_quantization_info);
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm.configure(compile_context, input, weights, biases, output, 1.0f, 1.0f, gemm_info);
- }
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, num_groups);
}
-Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
-{
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- gemm_3d_depth, // depth_output_gemm3d
- skip_im2col, // reinterpret_input_as_3d
- false, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- false, // fp_mixed_precision
- true, // broadcast_bias
- act_info); // activation_info
-
- if(is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = input->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->quantization_info();
-
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
- // Perform validation step on GEMMLowp
- return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info);
- }
- else
- {
- // Perform validation step on Matrix multiply function
- return CLGEMM::validate(input, weights, biases, output, 1.0f, 1.0f, gemm_info);
- }
-}
-
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
-}
-
-void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMConvolutionLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- conv_info,
- weights_info,
- dilation,
- act_info,
- num_groups));
-
- const DataType data_type = input->info()->data_type();
- const DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->info()->dimension(idx_width);
- const unsigned int kernel_height = weights->info()->dimension(idx_height);
- const unsigned int num_kernels = weights->info()->dimension(idx_kernels);
-
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- _skip_col2im = data_layout == DataLayout::NHWC;
-
- // Only for quantize there are few cases where we cannot fuse the activation function in GEMM
- _fuse_activation = true;
-
- // Set the GPU target for im2col and col2im
- _im2col_kernel->set_target(CLScheduler::get().target());
- _col2im_kernel->set_target(CLScheduler::get().target());
-
- const ICLTensor *gemm_input_to_use = input;
- ICLTensor *gemm_output_to_use = output;
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
- input->info()->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- unsigned int mat_weights_cols = num_kernels / num_groups;
-
- const ICLTensor *biases_to_use = biases;
- bool append_bias = false;
-
- ICLTensor *weights_to_use = &_weights_reshaped;
- if(num_groups != 1 && biases != nullptr)
- {
- // num_groups != 1 can only be for NCHW
- // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor
- biases_to_use = nullptr;
- append_bias = true;
-
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed.configure(compile_context, weights, biases, num_groups);
- weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
- }
- else
- {
- _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups);
- }
- }
- else
- {
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed.configure(compile_context, weights, nullptr, num_groups);
- weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
- }
- else
- {
- _reshape_weights.configure(compile_context, weights, nullptr, &_weights_reshaped, num_groups);
- }
- }
-
- // Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
- {
- _memory_group.manage(&_im2col_output);
-
- // Configure and tune im2col. im2col output shape is auto-initialized
- _im2col_kernel->configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
-
- // Set quantization info
- _im2col_output.info()->set_quantization_info(input->info()->quantization_info());
- CLScheduler::get().tune_kernel_static(*_im2col_kernel);
-
- // Update GEMM input
- gemm_input_to_use = &_im2col_output;
- }
-
- // Create GEMM output tensor
- if(!_skip_col2im)
- {
- TensorShape shape_gemm;
-
- // If we cannot skip col2im it means we run im2col as well
- shape_gemm = _im2col_output.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- TensorInfo info_gemm(shape_gemm, 1, data_type);
- info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
- _gemm_output.allocator()->init(info_gemm);
- _memory_group.manage(&_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output;
- }
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
-
- // Configure output stage for quantized case
- if(_is_quantized)
- {
- const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
- const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
-
- gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
-
- gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
- gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
- quantization::compute_quantized_multipliers_and_shifts(input->info(),
- weights->info(),
- output->info(),
- idx_kernels,
- gemmlowp_output_stage.gemmlowp_multipliers.data(),
- gemmlowp_output_stage.gemmlowp_shifts.data());
- gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
- gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
-
- auto min_activation = min_val.get<int32_t>();
- auto max_activation = max_val.get<int32_t>();
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(act_info.enabled())
- {
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info);
- }
- else
- {
- _fuse_activation = false;
- }
- }
-
- // Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
- gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
- }
-
- // Configure and tune GEMM
- // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
-
- configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
-
- if(!_skip_im2col)
- {
- _im2col_output.allocator()->allocate();
- }
-
- if(!_skip_col2im)
- {
- // Configure and tune Col2Im
- _col2im_kernel->configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
- CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
- }
-
- if(!_skip_col2im)
- {
- _gemm_output.allocator()->allocate();
- }
-
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
- "Output shape does not match the expected one");
-
- if(!_fuse_activation)
- {
- _activationlayer_function.configure(compile_context, output, nullptr, act_info);
- }
-
- ARM_COMPUTE_UNUSED(weights_info);
+ _impl->weights = weights;
+ _impl->op = std::make_unique<opencl::ClGemmConv2d>();
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input},
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
+ {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
+ };
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
-
- if(!is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(2) / weights->dimension(2)) != num_groups) && (input->data_layout() == DataLayout::NCHW));
-
- const DataLayout data_layout = input->data_layout();
- const DataType data_type = input->data_type();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
- const unsigned int num_kernels = weights->dimension(idx_kernels);
-
- TensorInfo im2col_reshaped_info{};
- TensorInfo info_gemm{};
- TensorInfo weights_reshaped_info{};
- const ITensorInfo *gemm_input_to_use = input;
- const ITensorInfo *gemm_output_to_use = output;
- const ITensorInfo *weights_to_use = weights;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- const bool skip_col2im = data_layout == DataLayout::NHWC;
- bool fuse_activation = true;
-
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- // Validate biases
- if(biases != nullptr)
- {
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(act_info.enabled())
- {
- ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
- }
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
- input->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- unsigned int mat_weights_cols = num_kernels / num_groups;
-
- const ITensorInfo *biases_to_use = biases;
- bool append_bias = false;
-
- if(num_groups != 1 && biases != nullptr)
- {
- // num_groups != 1 can only be for NCHW
- // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor
- biases_to_use = nullptr;
- append_bias = true;
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr, num_groups));
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, num_groups), 1, data_type);
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr, num_groups));
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, num_groups), 1, data_type);
- }
-
- weights_to_use = &weights_reshaped_info;
-
- if(!skip_im2col)
- {
- const Size2D kernel_dims(kernel_width, kernel_height);
-
- // Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, num_groups == 1, num_groups);
-
- auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
- gemm_input_to_use = &im2col_reshaped_info;
- }
-
- // Create GEMM output tensor
- if(!skip_col2im)
- {
- TensorShape shape_gemm;
-
- shape_gemm = gemm_input_to_use->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- info_gemm = TensorInfo(shape_gemm, 1, data_type);
- info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
- gemm_output_to_use = &info_gemm;
- }
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
- gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
-
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
- const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
- const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
-
- gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
- gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
- quantization::compute_quantized_multipliers_and_shifts(input,
- weights,
- output,
- idx_kernels,
- gemmlowp_output_stage.gemmlowp_multipliers.data(),
- gemmlowp_output_stage.gemmlowp_shifts.data());
- gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
- gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
-
- int min_activation = 0;
- int max_activation = 0;
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(act_info.enabled())
- {
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info);
- }
- else
- {
- fuse_activation = false;
- }
- }
-
- // Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
- gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
- }
-
- // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, act_info));
-
- // Validate Col2Im
- if(!skip_col2im)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups));
- }
-
- //Validate Activation Layer
- if(!fuse_activation)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
- }
-
- return Status{};
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
+ return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
}
void CLGEMMConvolutionLayer::run()
{
prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Run im2col
- if(!_skip_im2col)
- {
- CLScheduler::get().enqueue(*_im2col_kernel);
- }
-
- // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions
- if(_is_quantized)
- {
- // Run gemmlowp
- _mm_gemmlowp.run();
- }
- else
- {
- // Run gemm
- _mm_gemm.run();
- }
-
- // Reshape output matrix
- if(!_skip_col2im)
- {
- CLScheduler::get().enqueue(*_col2im_kernel.get(), false);
- }
-
- //Run Activation Layer if we cannot fuse in GEMM
- if(!_fuse_activation)
- {
- _activationlayer_function.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void CLGEMMConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ _impl->op->prepare(_impl->prep_pack);
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
- _weights_manager->run(_original_weights, &_reshape_weights_managed);
+ _impl->weights->mark_as_unused();
}
else
{
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
- }
-
- // Prepare GEMM
- _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
- if(!_weights_reshaped.is_used())
- {
- _weights_reshaped.allocator()->free();
+ // Pack the B matrix to be used as the underlying GEMM performs no reshapes
+ _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights);
}
-
- CLScheduler::get().queue().finish();
- _is_prepared = true;
+ release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors);
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index d5d1b5f41e..7d40cf1829 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,15 @@
#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
#include <tuple>
@@ -44,12 +40,13 @@ namespace arm_compute
{
namespace
{
-std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+std::pair<Coordinates, Coordinates>
+compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
{
Coordinates start;
Coordinates end;
- if(is_nchw)
+ if (is_nchw)
{
start.set(0, deconv_info.pad_left());
start.set(1, deconv_info.pad_top());
@@ -67,13 +64,16 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT
end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
}
- return { start, end };
+ return {start, end};
}
-Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info)
+Status construct_gemmlowp_output_stage(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ GEMMLowpOutputStageInfo &output_stage_info)
{
const auto data_type = input->data_type();
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
@@ -82,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier(0);
int output_shift(0);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -126,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage
CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default;
-Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
DataLayout data_layout = input->data_layout();
- const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+ const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 ||
+ deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
const bool is_nchw = input->data_layout() == DataLayout::NCHW;
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
@@ -148,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
TensorShape nhwc_weights_shape = weights->tensor_shape();
TensorShape nhwc_input_shape = input->tensor_shape();
- if(is_nchw)
+ if (is_nchw)
{
permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
permute(nhwc_input_shape, PermutationVector(2, 0, 1));
- TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+ TensorInfo nhwc_input_info = input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(nhwc_input_shape)
+ .set_data_layout(DataLayout::NCHW);
- TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+ TensorInfo nhwc_weights_info = weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(nhwc_weights_shape)
+ .set_data_layout(DataLayout::NCHW);
CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
}
- const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
- const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+ const TensorShape reshaped_shape =
+ TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+ const TensorInfo reshaped_info =
+ weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]);
@@ -170,76 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
- input->dimension(idx_w),
- input->dimension(idx_h),
- input->dimension(idx_b));
+ input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b));
TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true);
GEMMLowpOutputStageInfo output_stage_info;
- if(is_quantized)
+ if (is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
- gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
+ &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr,
+ &gemm_output_info.set_data_type(DataType::S32), gemm_info));
ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true),
+ &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
}
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
- const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
- TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+ weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
+ const TensorShape deconv_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
- if(padded_input && is_quantized)
+ if (padded_input && is_quantized)
{
const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(
+ &col2im_output_info, nullptr,
+ &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()),
+ output, start_end.first, start_end.second));
}
- else if(padded_input)
+ else if (padded_input)
{
const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
}
- else if(is_quantized)
+ else if (is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
}
return Status{};
}
-void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info);
}
-void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
- weights->info(),
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(
+ input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info);
_original_weights = weights;
- _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 ||
+ deconv_info.pad_top() > 0;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
const ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
@@ -248,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
// do an outer product in NCHW and then an accumulation through a reduction. This would have two
// drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction
// might be slower than GEMM.
- if(_is_nchw)
+ if (_is_nchw)
{
_memory_group.manage(&_permuted_input);
_permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
@@ -260,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
// Reshape the input weights. The weights will be reshaped only once during the call to prepare()
- _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
- weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
- 1,
- input->info()->data_type(), weights->info()->quantization_info()));
+ _reshaped_weights.allocator()->init(
+ TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) *
+ weights_to_use->info()->dimension(2) *
+ weights_to_use->info()->dimension(3)),
+ 1, input->info()->data_type(), weights->info()->quantization_info()));
_reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights);
_transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t);
@@ -272,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true);
// Configure output stage for asymmetric quantized types
- if(_is_quantized)
+ if (_is_quantized)
{
// gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original
// and restore them back to make it work properly.
QuantizationInfo iq_info = input->info()->quantization_info();
QuantizationInfo wq_info = weights->info()->quantization_info();
- input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
- _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
+ input_to_use->info()->set_quantization_info(
+ QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
+ _reshaped_weights_t.info()->set_quantization_info(
+ QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
_mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
@@ -289,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
else
{
- _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+ _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f,
+ gemm_info);
}
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_input.allocator()->allocate();
}
@@ -301,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
ICLTensor *slice_output = nullptr;
ICLTensor *output_stage_output = nullptr;
- if(_padded_input && _is_quantized)
+ if (_padded_input && _is_quantized)
{
_memory_group.manage(&_slice_gemm_input);
_memory_group.manage(&_gemmlowp_final);
@@ -309,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
output_stage_output = &_slice_gemm_input;
slice_output = output;
}
- else if(_padded_input)
+ else if (_padded_input)
{
_memory_group.manage(&_slice_gemm_input);
deconv_reshape_output = &_slice_gemm_input;
slice_output = output;
}
- else if(_is_quantized)
+ else if (_is_quantized)
{
_memory_group.manage(&_gemmlowp_final);
deconv_reshape_output = &_gemmlowp_final;
@@ -327,21 +367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
// Configure a Col2Im call to reshape the output of GEMM
- _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+ _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(),
+ weights->info(), deconv_info);
_gemm_output.allocator()->allocate();
- if(_is_quantized)
+ if (_is_quantized)
{
GEMMLowpOutputStageInfo output_stage_info;
construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
- _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
+ _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output,
+ output_stage_info);
_gemmlowp_final.allocator()->allocate();
}
// If the input was padded, the output needs to be sliced.
- if(_padded_input)
+ if (_padded_input)
{
- const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+ const auto start_end =
+ compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
_slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second);
_slice_gemm_input.allocator()->allocate();
}
@@ -353,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input_to_nhwc.run();
}
- if(_is_quantized)
+ if (_is_quantized)
{
_mm_gemmlowp.run();
}
@@ -369,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run()
CLScheduler::get().enqueue(*_deconv_reshape, false);
- if(_is_quantized)
+ if (_is_quantized)
{
_gemmlowp_output_stage.run();
}
- if(_padded_input)
+ if (_padded_input)
{
_slice_gemm.run();
}
@@ -382,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run()
void CLGEMMDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_weights.allocator()->allocate();
_permute_weights_to_nhwc.run();
@@ -395,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare()
_reshaped_weights.allocator()->allocate();
_reshape_weights.run();
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_weights.allocator()->free();
}
@@ -404,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare()
_transpose_weights.run();
// Prepare gemm
- if(!_is_quantized)
+ if (!_is_quantized)
{
_mm_gemm.prepare();
}
@@ -414,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare()
}
// Free resources
- if(!_reshaped_weights_t.is_used())
+ if (!_reshaped_weights_t.is_used())
{
_reshaped_weights_t.allocator()->free();
}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 3be09581bd..8bad198658 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -30,683 +31,103 @@
#include "arm_compute/core/Log.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "src/core/gpu/cl/kernels/ClCastKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::cl_gemm;
-
-namespace
-{
-inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
-{
- switch(kernel_type)
- {
- case CLGEMMKernelType::NATIVE:
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- return true;
- }
- default:
- {
- return false;
- }
- }
-}
-//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
-inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
-{
- auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
- if(bool(gemm_kernel))
- {
- if(validate_gemm_kernel(gemm_kernel.gemm_type))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
- }
- }
- gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
-}
-// Validate lhs_info and rhs_info for native kernel
-inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
-{
- // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
- TensorInfo mm_result_s32_info{};
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
- // Validate mm kernel
- // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info
- // NOTE: This assumes:
- // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
- // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
- if(!bool(CLGEMMLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
- {
- return false;
- }
- return true;
-}
+#include "arm_compute/runtime/IMemoryManager.h"
-// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
-{
- auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
- if(config)
- {
- if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_native(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
-// Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
- unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+namespace arm_compute
{
- // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
- TensorInfo tmp_b_info{};
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
- {
- return false;
- }
- // Validate mm kernel
- // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
- // NOTE: This assumes:
- // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
- // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
- GEMMKernelInfo gemm_kernel_info;
- gemm_kernel_info.m = m;
- gemm_kernel_info.n = n;
- gemm_kernel_info.k = k;
- gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- // Since we ignore the output stage, output data type has to be S32 to pass the validation
- TensorInfo output_info_copy(*output);
- output_info_copy.set_data_type(DataType::S32);
- if(!bool(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
- {
- return false;
- }
- return true;
-}
+using namespace arm_compute::experimental;
+using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore;
-// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
- const ITensorInfo *a,
- const ITensorInfo *b, const ITensorInfo *output)
+struct CLGEMMLowpMatrixMultiplyCore::Impl
{
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
- if(config)
- {
- if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
-
-inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
-{
- switch(kernel_type)
- {
- case CLGEMMKernelType::NATIVE:
- return false;
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- return true;
- default:
- ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
- }
-}
-} // namespace
+ const ICLTensor *b{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
+ MemoryGroup memory_group{};
+ ITensorPack run_pack{};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<CLTensor> workspace_tensors{};
+ bool is_prepared{false};
+};
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)),
- _weights_to_qasymm8(std::make_unique<opencl::kernels::ClCastKernel>()),
- _mm_native_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyNativeKernel>()),
- _mm_reshaped_only_rhs_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>()),
- _mtx_b_reshape_kernel(std::make_unique<opencl::kernels::ClGemmReshapeRhsMatrixKernel>()),
- _mtx_a_reduction_kernel(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _mtx_b_reduction_kernel(std::make_unique<CLGEMMLowpMatrixBReductionKernel>()),
- _offset_contribution_kernel(std::make_unique<CLGEMMLowpOffsetContributionKernel>()),
- _offset_contribution_output_stage_kernel(std::make_unique<CLGEMMLowpOffsetContributionOutputStageKernel>()),
- _qasymm8_weights(),
- _vector_sum_col(),
- _vector_sum_row(),
- _tmp_b(),
- _mm_result_s32(),
- _gemm_output_stage_multipliers(),
- _gemm_output_stage_shifts(),
- _matrix_a(nullptr),
- _original_b(nullptr),
- _output(nullptr),
- _a_offset(0),
- _b_offset(0),
- _is_gemm_reshaped(true),
- _reshape_b_only_on_first_run(false),
- _is_prepared(false),
- _run_output_stage(false),
- _convert_to_qasymm8(false),
- _run_offset_contribution(false)
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_group = MemoryGroup(memory_manager);
}
CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default;
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(
+ const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
}
-void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- _is_prepared = false;
- _original_b = b;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _a_offset = a->info()->quantization_info().uniform().offset;
- _matrix_a = a;
- _output = output;
-
- _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
- && a->info()->data_type() == DataType::QASYMM8;
- _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Set the target for the kernels
- _mm_native_kernel->set_target(gpu_target);
- _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
-
- GEMMRHSMatrixInfo rhs_info;
- GEMMLHSMatrixInfo lhs_info;
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
- const unsigned int n = b->info()->dimension(0);
- const unsigned int k = a->info()->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
- // Check if we need to reshape the matrix A and matrix B
- _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->info()->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run));
- if(_convert_to_qasymm8)
- {
- // Set data type for converted weights
- TensorInfo weights_info(*b->info());
- weights_info.set_data_type(DataType::QASYMM8);
- _qasymm8_weights.allocator()->init(weights_info);
- _weights_to_qasymm8->configure(compile_context, b->info(), _qasymm8_weights.info(), ConvertPolicy::WRAP);
- }
-
- const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
- if(_is_gemm_reshaped)
- {
- matrix_b = &_tmp_b;
-
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
+ _impl->b = b;
+ _impl->op = std::make_unique<OperatorType>();
+ _impl->is_prepared = gemm_info.retain_internal_weights();
- // Pick up the GEMM configuration
- // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
- depth_output_gemm3d,
- a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), output->info());
+ _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+ gemm_info);
+ _impl->aux_mem_req = _impl->op->workspace();
- // Configure reshape RHS kernel
- _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), _tmp_b.info(), rhs_info);
- }
-
- // Using default reduction info
- const GEMMLowpReductionKernelInfo reduction_info {};
-
- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
+ // Manage/allocate auxiliary tensors
+ if (_impl->is_prepared)
{
- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
- _vector_sum_col.allocator()->init(info_vector_sum_col);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_vector_sum_col);
- }
-
- // Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
- }
-
- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
- _vector_sum_row.allocator()->init(info_vector_sum_row);
- _memory_group.manage(&_vector_sum_row);
-
- // Configure matrix A reduction kernel
- _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
- }
-
- GEMMKernelInfo gemm_kernel_info;
- gemm_kernel_info.m = m;
- gemm_kernel_info.n = n;
- gemm_kernel_info.k = k;
- gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- gemm_kernel_info.a_offset = _a_offset;
- gemm_kernel_info.b_offset = _b_offset;
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- // Configure offset contribution kernel
- const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
-
- _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
- _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
- gemmlowp_output_stage.output_data_type = _matrix_a->info()->data_type();
- if(num_filters == 1)
- {
- // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
- // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
- gemmlowp_output_stage.is_quantized_per_channel = false;
- }
-
- gemm_kernel_info.output_stage = gemmlowp_output_stage;
-
- if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- // Configure and tune matrix multiply kernel with fused output stage
- _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
- }
- else
- {
- _run_output_stage = true;
-
- _memory_group.manage(&_mm_result_s32);
-
- if(_is_gemm_reshaped)
- {
- _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
- }
- else
- {
- // Pick up the GEMM configuration
- // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
- _matrix_a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : matrix_b->info(), reshape_info);
-
- // Configure matrix multiply kernel
- _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);
-
- _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
- a->info()->dimension(0),
- _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
- _mm_result_s32.allocator()->allocate();
- }
- }
-
- _gemm_output_stage_multipliers.allocator()->allocate();
- _gemm_output_stage_shifts.allocator()->allocate();
- // Compute GEMM output multipliers and shifts for output stage
- _gemm_output_stage_multipliers.map();
- _gemm_output_stage_shifts.map();
- std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
- std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
- _gemm_output_stage_multipliers.unmap();
- _gemm_output_stage_shifts.unmap();
+ _impl->run_pack.add_const_tensor(ACL_SRC_0, a);
+ _impl->run_pack.add_tensor(ACL_DST, output);
}
else
{
- _run_offset_contribution = true;
- if(_is_gemm_reshaped)
- {
- // Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
- }
- else
- {
- // Pick up the GEMM configuration
- // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
- a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), reshape_info);
-
- // Configure matrix multiply kernel
- _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, reshape_info);
- }
-
- // Configure offset contribution kernel
- _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
- _b_offset);
- }
-
- // Allocate tensors
- if(_is_gemm_reshaped)
- {
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- }
-
- if(_b_offset != 0)
- {
- _vector_sum_row.allocator()->allocate();
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}};
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
}
-Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- const ITensorInfo *matrix_a_info = a;
-
- TensorInfo tmp_b_info{};
- GEMMRHSMatrixInfo rhs_info;
- GEMMLHSMatrixInfo lhs_info;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
- bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
- && is_data_type_quantized_asymmetric(a->data_type());
- TensorInfo weights_info(*b);
- if(convert_to_qasymm8)
- {
- b_offset = -128;
- weights_info.set_data_type(DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
- }
- const ITensorInfo *matrix_b_info = &weights_info;
- if(reshape_matrix_b)
- {
- matrix_b_info = &tmp_b_info;
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
- const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
-
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
- }
-
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- const GEMMLowpReductionKernelInfo reduction_info;
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
- }
-
- GEMMKernelInfo gemm_kernel_info;
- gemm_kernel_info.m = m;
- gemm_kernel_info.n = n;
- gemm_kernel_info.k = k;
- gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- gemm_kernel_info.a_offset = a_offset;
- gemm_kernel_info.b_offset = b_offset;
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
-
- const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
- gemmlowp_output_stage.output_data_type = a->data_type();
-
- gemm_kernel_info.output_stage = gemmlowp_output_stage;
- if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- &gemm_output_stage_multipliers_shifts_info,
- &gemm_output_stage_multipliers_shifts_info));
- }
- else
- {
- TensorInfo mm_result_s32_info{};
-
- if(reshape_matrix_b)
- {
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
- }
- else
- {
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
- const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- output,
- a_offset, b_offset,
- gemmlowp_output_stage,
- &gemm_output_stage_multipliers_shifts_info,
- &gemm_output_stage_multipliers_shifts_info));
- }
- }
- else
- {
- if(reshape_matrix_b)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
- }
- else
- {
- // Pick up the GEMM configuration
- // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
- const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
- }
-
- if(output->total_size() != 0)
- {
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- a_offset, b_offset));
- }
- }
-
- return Status{};
+ return OperatorType::validate(a, b, c, output, gemm_info);
}
void CLGEMMLowpMatrixMultiplyCore::run()
{
prepare();
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_is_gemm_reshaped)
- {
- if(!_reshape_b_only_on_first_run)
- {
- // Run reshape matrix B
- ITensorPack mtx_b_pack;
- mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b);
- mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b);
- CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false);
- }
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
- }
-
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false);
- }
-
- // Run matrix multiply
- if(_is_gemm_reshaped)
- {
- CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false);
- }
- else
- {
- CLScheduler::get().enqueue(*_mm_native_kernel, false);
- }
- if(_run_output_stage)
- {
- // Run offset contribution/output stage kernel
- CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true);
- }
- if(_run_offset_contribution)
- {
- // Run offset contribution kernel
- CLScheduler::get().enqueue(*_offset_contribution_kernel, true);
- }
+ _impl->op->run(_impl->run_pack);
}
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- if(_convert_to_qasymm8)
- {
- _qasymm8_weights.allocator()->allocate();
- ITensorPack convert_to_qs8_pack = { { ACL_SRC, _original_b }, { ACL_DST, &_qasymm8_weights } };
- CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
- }
-
- if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
- // Run reshape kernel and mark original weights tensor as unused
- _tmp_b.allocator()->allocate();
- ITensorPack mtx_b_pack;
- mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b);
- mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b);
- CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
- _original_b->mark_as_unused();
- }
+ _impl->op->prepare(_impl->run_pack);
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && _reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
- }
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors);
- CLScheduler::get().queue().finish();
- _is_prepared = true;
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index be452aaf3d..3dd8c5f101 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,166 +23,73 @@
*/
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
+
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h"
#include <algorithm>
namespace arm_compute
{
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
-{
- GEMMLowpOutputStageInfo info{};
- info.gemmlowp_multiplier = result_fixedpoint_multiplier;
- info.gemmlowp_shift = result_shift;
- info.gemmlowp_offset = result_offset_after_shift;
- info.gemmlowp_min_bound = min;
- info.gemmlowp_max_bound = max;
- info.output_data_type = DataType::QASYMM8;
- auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
- k->configure(compile_context, input, bias, output, &info);
- _kernel = std::move(k);
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- GEMMLowpOutputStageInfo info{};
- info.gemmlowp_min_bound = min;
- info.gemmlowp_max_bound = max;
- info.output_data_type = DataType::QASYMM8;
- return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min, int max)
+struct CLGEMMLowpOutputStage::Impl
{
- GEMMLowpOutputStageInfo info{};
- info.gemmlowp_multiplier = result_fixedpoint_multiplier;
- info.gemmlowp_shift = result_shift;
- info.gemmlowp_offset = result_offset_after_shift;
- info.gemmlowp_min_bound = min;
- info.gemmlowp_max_bound = max;
- info.output_data_type = DataType::QASYMM8_SIGNED;
- auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
- k->configure(compile_context, input, bias, output, &info);
- _kernel = std::move(k);
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
+ const ICLTensor *src{nullptr};
+ const ICLTensor *bias{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr};
+ ITensorPack run_pack{};
+};
+
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
{
- GEMMLowpOutputStageInfo info{};
- info.gemmlowp_min_bound = min;
- info.gemmlowp_max_bound = max;
- info.output_data_type = DataType::QASYMM8_SIGNED;
- return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
}
-
-void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift,
- int min, int max)
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default;
+
+void CLGEMMLowpOutputStage::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+ configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
}
-void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- int result_fixedpoint_multiplier, int result_shift,
- int min, int max)
+void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
- GEMMLowpOutputStageInfo info{};
- info.gemmlowp_multiplier = result_fixedpoint_multiplier;
- info.gemmlowp_shift = result_shift;
- info.gemmlowp_min_bound = min;
- info.gemmlowp_max_bound = max;
- info.output_data_type = DataType::QSYMM16;
- auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
- k->configure(compile_context, input, bias, output, &info);
- _kernel = std::move(k);
-}
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
-{
- GEMMLowpOutputStageInfo info{};
- info.gemmlowp_min_bound = min;
- info.gemmlowp_max_bound = max;
- info.output_data_type = DataType::QSYMM16;
- return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
-}
+ _impl->src = input;
+ _impl->bias = bias;
+ _impl->dst = output;
-void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
+ _impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>();
+ _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(),
+ info);
+ _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}};
}
-void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
- k->configure(compile_context, input, bias, output, &info);
- _kernel = std::move(k);
- break;
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>();
- k->configure(compile_context, input, bias, output, &info);
- _kernel = std::move(k);
- break;
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
- {
- auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
- k->configure(compile_context, input, bias, output, &info);
- _kernel = std::move(k);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
- }
+ return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info);
}
-Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::run()
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
- return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
- }
+ _impl->op->run(_impl->run_pack);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index bde34dc4db..2610cb1a3b 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLGather.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLGatherKernel.h"
namespace arm_compute
@@ -33,8 +35,13 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGather::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
+ ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis);
auto k = std::make_unique<CLGatherKernel>();
k->configure(compile_context, input, indices, output, axis);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 81e24dba08..b2c1d2631e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -27,6 +27,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
#include "src/core/CL/kernels/CLPadLayerKernel.h"
@@ -69,47 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManage
CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default;
-void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+void CLGenerateProposalsLayer::configure(const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
const GenerateProposalsInfo &info)
{
- configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
+ configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out,
+ num_valid_proposals, info);
}
-void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals,
- ICLTensor *scores_out,
- ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info)
+void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+ proposals->info(), scores_out->info(),
+ num_valid_proposals->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
_is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType scores_data_type = scores->info()->data_type();
_is_qasymm8 = scores_data_type == DataType::QASYMM8;
- const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
- const int total_num_anchors = num_anchors * feat_width * feat_height;
- const int pre_nms_topN = info.pre_nms_topN();
- const int post_nms_topN = info.post_nms_topN();
- const size_t values_per_roi = info.values_per_roi();
+ const int num_anchors = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
- const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+ const QuantizationInfo rois_qinfo =
+ (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
// Compute all the anchors
_memory_group.manage(&_all_anchors);
- _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+ _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors,
+ ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
- _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+ _deltas_flattened.allocator()->init(
+ TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
// Permute and reshape deltas
_memory_group.manage(&_deltas_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
_flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
@@ -123,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
// Permute and reshape scores
_memory_group.manage(&_scores_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1});
_flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
@@ -137,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
CLTensor *anchors_to_use = &_all_anchors;
CLTensor *deltas_to_use = &_deltas_flattened;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
_deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -160,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
anchors_to_use->allocator()->allocate();
_all_proposals_to_use = &_all_proposals;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_memory_group.manage(&_all_proposals_quantized);
// Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
- _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _all_proposals_quantized.allocator()->init(
+ TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
_quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized);
_all_proposals.allocator()->allocate();
_all_proposals_to_use = &_all_proposals_quantized;
@@ -180,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
// Note that NMS needs outputs preinitialized.
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
- auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+ rois_qinfo);
auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
// Initialize temporaries (unused) outputs
@@ -192,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
_num_valid_proposals = num_valid_proposals;
_memory_group.manage(&_proposals_4_roi_values);
- _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
- BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+ _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values,
+ &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+ BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+ true, min_size_scaled, info.im_width(), info.im_height()));
_keeps_nms_unused.allocator()->allocate();
_classes_nms_unused.allocator()->allocate();
_all_proposals_to_use->allocator()->allocate();
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
_proposals_4_roi_values.allocator()->allocate();
}
-Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
- const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -213,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
- const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+ const int num_anchors =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -224,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
}
- TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
-
- TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- if(scores->data_layout() == DataLayout::NHWC)
+ TensorInfo all_anchors_info(
+ anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(
+ anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info =
+ deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+ .set_is_resizable(true);
+ TensorInfo scores_permuted_info =
+ scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if (scores->data_layout() == DataLayout::NHWC)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
}
- TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo deltas_flattened_info(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
- TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo scores_flattened_info(
+ scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
- TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
- if(is_qasymm8)
+ TensorInfo proposals_4_roi_values_quantized(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+ .set_quantization_info(QuantizationInfo(0.125f, 0));
+ if (is_qasymm8)
{
- TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ TensorInfo all_anchors_f32_info(anchors->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
- TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
-
- TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ TensorInfo deltas_flattened_f32_info(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(
+ &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
- if(num_valid_proposals->total_size() > 0)
+ if (num_valid_proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
}
- if(proposals->total_size() > 0)
+ if (proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -306,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
}
}
- if(scores_out->total_size() > 0)
+ if (scores_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -353,7 +412,7 @@ void CLGenerateProposalsLayer::run()
CLScheduler::get().enqueue(*_compute_anchors_kernel, false);
// Transpose and reshape the inputs
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_permute_deltas.run();
_permute_scores.run();
@@ -361,7 +420,7 @@ void CLGenerateProposalsLayer::run()
_flatten_deltas.run();
_flatten_scores.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_dequantize_anchors->run();
_dequantize_deltas->run();
@@ -370,7 +429,7 @@ void CLGenerateProposalsLayer::run()
// Build the boxes
CLScheduler::get().enqueue(*_bounding_box_kernel, false);
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_quantize_all_proposals->run();
}
diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
new file mode 100644
index 0000000000..1a2369c5c2
--- /dev/null
+++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
+
+namespace arm_compute
+{
+struct CLIndirectConvolutionLayer::Impl
+{
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr};
+};
+
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>())
+{
+}
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
+CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default;
+CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default;
+
+void CLIndirectConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
+}
+
+void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
+
+ _impl->src = input;
+ _impl->weights = weights;
+ _impl->biases = biases;
+ _impl->dst = output;
+ _impl->op = std::make_unique<opencl::ClIndirectConv2d>();
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+}
+
+Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
+}
+
+void CLIndirectConvolutionLayer::run()
+{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+ _impl->op->run(pack);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index 4a0bda8255..0e994e1aee 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -27,6 +27,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
@@ -34,40 +36,53 @@
namespace arm_compute
{
CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT
- : _inst_norm_kernel(),
- _mean_var_kernel(),
- _mean_var_tensor(),
- _ctx(ctx)
+ : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx)
{
}
CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer()
{
}
-void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(
+ ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision);
}
-void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision);
auto w = std::make_unique<CLComputeMeanVariance>();
w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision);
_mean_var_kernel = std::move(w);
auto k = std::make_unique<CLInstanceNormalizationLayerKernel>();
- k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+ k->configure(compile_context, input, &_mean_var_tensor, output,
+ InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
_inst_norm_kernel = std::move(k);
_mean_var_tensor.allocator()->allocate();
}
-Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision)
{
- return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+ return CLInstanceNormalizationLayerKernel::validate(
+ input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
}
void CLInstanceNormalizationLayer::run()
{
- ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured");
+ ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel,
+ "The child class didn't set the CL kernel or function isn't configured");
schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get());
schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get());
}
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 8c360aaa9e..4fe1d9b20b 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,8 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
@@ -55,8 +57,11 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon);
}
-void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayer::configure(
+ const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon);
+
// Reset auxiliary tensor
_sumsq.allocator()->init(TensorInfo());
@@ -82,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo
sum_sq.set_tensor_shape(shape);
const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
// Reduce shape on axis
shape.set(actual_axis, 1);
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 85d13c246e..3b50234c77 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -24,18 +24,15 @@
#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
namespace arm_compute
{
@@ -43,51 +40,156 @@ using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::utils::info_helpers;
/** Constructs the LSTM layer.
 *
 * The memory manager is moved into the internal memory group, the transpose kernel is created
 * eagerly with std::make_unique, every other sub-function and auxiliary tensor is
 * value-initialised, and all option/state flags start as false.
 *
 * NOTE(review): _recurrent_to_cell_weights is assigned in configure() but is not listed in this
 * initialiser list - confirm it has an in-class default initialiser.
 *
 * @param memory_manager Shared memory manager handed over to the memory group.
 */
CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
- _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(),
- _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(),
- _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(),
- _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(),
- _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(),
- _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(),
- _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(),
- _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(),
- _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(),
- _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false),
- _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false)
+ : _memory_group(std::move(memory_manager)),
+ _fully_connected_input_gate(),
+ _accum_input_gate1(),
+ _subtract_input_gate(),
+ _pixelwise_mul_input_gate(),
+ _activation_input_gate(),
+ _fully_connected_forget_gate(),
+ _accum_forget_gate1(),
+ _pixelwise_mul_forget_gate(),
+ _activation_forget_gate(),
+ _fully_connected_cell_state(),
+ _gemm_cell_state1(),
+ _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()),
+ _accum_cell_state1(),
+ _accum_cell_state2(),
+ _pixelwise_mul_cell_state1(),
+ _activation_cell_state(),
+ _cell_clip(),
+ _pixelwise_mul_cell_state2(),
+ _fully_connected_output(),
+ _pixelwise_mul_output_state1(),
+ _accum_output1(),
+ _activation_output(),
+ _activation_output_state(),
+ _pixelwise_mul_output_state2(),
+ _fully_connected_output_state(),
+ _projection_clip(),
+ _copy_cell_state(),
+ _copy_output(),
+ _concat_scratch_buffer(),
+ _concat_inputs_forget_gate(),
+ _concat_weights_forget_gate(),
+ _concat_weights_input_gate(),
+ _concat_weights_output(),
+ _ones_fill(),
+ _mean_std_norm_input_gate(),
+ _pixelwise_mul_input_gate_coeff(),
+ _accum_input_gate_bias(),
+ _mean_std_norm_forget_gate(),
+ _pixelwise_mul_forget_gate_coeff(),
+ _accum_forget_gate_bias(),
+ _mean_std_norm_cell_gate(),
+ _pixelwise_mul_cell_gate_coeff(),
+ _accum_cell_gate_bias(),
+ _mean_std_norm_output_gate(),
+ _pixelwise_mul_output_gate_coeff(),
+ _accum_output_gate_bias(),
+ _input_gate_out1(),
+ _input_gate_out2(),
+ _input_gate_out3(),
+ _input_gate_out4(),
+ _forget_gate_out1(),
+ _forget_gate_out2(),
+ _forget_gate_out3(),
+ _forget_gate_out4(),
+ _forget_gate_out5(),
+ _forget_gate_out6(),
+ _cell_state_out1(),
+ _cell_state_out2(),
+ _cell_state_out3(),
+ _cell_state_out4(),
+ _cell_state_out5(),
+ _output1(),
+ _output2(),
+ _output3(),
+ _output4(),
+ _cell_state_activation(),
+ _output_state1(),
+ _ones(),
+ _input_layer_norm_out1(),
+ _input_layer_norm_out2(),
+ _forget_layer_norm_out1(),
+ _forget_layer_norm_out2(),
+ _cell_layer_norm_out1(),
+ _cell_layer_norm_out2(),
+ _output_layer_norm_out1(),
+ _output_layer_norm_out2(),
+ _run_peephole_opt(false),
+ _run_cifg_opt(false),
+ _perform_cell_clipping(false),
+ _has_projection_weights(false),
+ _perform_projection_clipping(false),
+ _is_prepared(false),
+ _is_layer_norm_lstm(false)
{
}
/** Defaulted destructor, defined out of line in this translation unit. */
CLLSTMLayer::~CLLSTMLayer() = default;
-void CLLSTMLayer::configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
// Convenience overload: takes the compile context from the global CLKernelLibrary and forwards
// every argument, unchanged and in the same order, to the overload that accepts a CLCompileContext.
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in,
+ cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
cell_threshold, projection_threshold);
}
-void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ cell_threshold, projection_threshold);
+
_is_layer_norm_lstm = lstm_params.use_layer_norm();
// Set lstm parameters
@@ -95,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
- input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- output_state_in->info(), cell_state_in->info(),
- scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
- lstm_params_info, activation_info, cell_threshold, projection_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+ cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+ lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
// Configure block that calculates the forget gate
@@ -125,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
weights_vector.emplace_back(input_to_forget_weights);
weights_vector.emplace_back(recurrent_to_forget_weights);
- const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+ const TensorShape weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
_forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6,
+ (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
CLTensor *forget_gate_out = &_forget_gate_out5;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
- _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_forget_gate_out4, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+ ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -153,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
{
_forget_gate_out3.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_layer_norm_out1);
_memory_group.manage(&_forget_layer_norm_out2);
_mean_std_norm_forget_gate.configure(compile_context, forget_gate_out);
- _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out,
+ lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
forget_gate_out->allocator()->allocate();
- _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias,
+ &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
_forget_layer_norm_out1.allocator()->allocate();
forget_gate_out = &_forget_layer_norm_out2;
}
- _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -177,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
// input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
CLTensor *input_gate_out = &_input_gate_out1;
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
- _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -194,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
std::vector<const ICLTensor *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
_input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX);
@@ -202,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out3);
- _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2,
+ (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+ &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out4);
- _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+ &_input_gate_out4, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -220,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_input_gate_out1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_input_layer_norm_out1);
_memory_group.manage(&_input_layer_norm_out2);
_mean_std_norm_input_gate.configure(compile_context, input_gate_out);
- _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out,
+ lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
input_gate_out->allocator()->allocate();
- _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(),
+ &_input_layer_norm_out2, ConvertPolicy::SATURATE);
_input_layer_norm_out1.allocator()->allocate();
input_gate_out = &_input_layer_norm_out2;
}
- _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_input_gate.configure(compile_context, input_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -248,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights,
+ (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
_transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info());
_recurrent_to_cell_weights = recurrent_to_cell_weights;
_memory_group.manage(&_cell_state_out3);
- _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+ _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f,
+ 0.f);
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
- _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4,
+ ConvertPolicy::SATURATE);
CLTensor *cell_state_out_ptr = &_cell_state_out4;
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_layer_norm_out1);
_memory_group.manage(&_cell_layer_norm_out2);
_mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr);
- _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr,
+ lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
cell_state_out_ptr->allocator()->allocate();
- _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_cell_layer_norm_out1.allocator()->allocate();
cell_state_out_ptr = &_cell_layer_norm_out2;
}
_activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
cell_state_out_ptr->allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1,
+ ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
- if(cell_threshold != 0.f)
+ if (cell_threshold != 0.f)
{
_perform_cell_clipping = true;
- _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+ _cell_clip.configure(compile_context, &_cell_state_out1, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold));
}
// Configure block that calculates the output
@@ -297,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
std::vector<const ICLTensor *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
_output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX);
@@ -305,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+ _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2,
+ (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
CLTensor *output_gate_out = &_output4;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
_memory_group.manage(&_output3);
- _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(),
+ &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -328,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
{
_output1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_output_layer_norm_out1);
_memory_group.manage(&_output_layer_norm_out2);
_mean_std_norm_output_gate.configure(compile_context, output_gate_out);
- _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out,
+ lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
output_gate_out->allocator()->allocate();
- _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias,
+ &_output_layer_norm_out2, ConvertPolicy::SATURATE);
_output_layer_norm_out1.allocator()->allocate();
output_gate_out = &_output_layer_norm_out2;
}
- _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_output.configure(compile_context, output_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -360,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_cell_state_activation);
_activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info);
- _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out,
+ output_state_out_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
_cell_state_activation.allocator()->allocate();
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
_has_projection_weights = true;
- _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out);
_output_state1.allocator()->allocate();
// Perform clipping
- if(projection_threshold != 0.f)
+ if (projection_threshold != 0.f)
{
_perform_projection_clipping = true;
- _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ _projection_clip.configure(compile_context, output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ projection_threshold, -projection_threshold)); // LU_BOUNDED_RELU = min(a, max(b, x)): a is the upper bound, b the lower
}
}
@@ -382,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
// Vector for holding the tensors to store in scratch buffer
std::vector<const ICLTensor *> scratch_inputs;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
scratch_inputs.emplace_back(input_gate_out);
}
@@ -396,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
output_gate_out->allocator()->allocate();
}
-Status CLLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status CLLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check dimensions
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -437,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
- && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+ cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
const unsigned int num_batches = input->dimension(1);
const unsigned int num_cells = input_to_output_weights->dimension(1);
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
// If CIFG is used, input layer normalization weights tensor is omitted
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
}
@@ -458,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
}
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -469,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
}
// Check peephole optimization
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -487,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
// Validate forget gate
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate input gate
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
- lstm_params.recurrent_to_input_weights(),
- lstm_params.input_gate_bias());
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -524,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
- TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, lstm_params.input_to_input_weights(),
+ (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+ &input_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(lstm_params.use_layer_norm())
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(cell_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (cell_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
- cell_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&cell_state_tmp, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -cell_threshold, cell_threshold)));
}
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
- TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
// Validate output gate tmp
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- if(lstm_params.has_projection())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
- if(projection_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ if (lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out));
+ if (projection_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out,
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ output_state_out, output_state_out,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
}
}
@@ -615,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
// Validate scratch concatenation
std::vector<const ITensorInfo *> inputs_vector_info_raw;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
inputs_vector_info_raw.push_back(&input_gate);
}
@@ -637,12 +829,12 @@ void CLLSTMLayer::run()
_fully_connected_forget_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_forget_gate.run();
_accum_forget_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_forget_gate.run();
_pixelwise_mul_forget_gate_coeff.run();
@@ -650,7 +842,7 @@ void CLLSTMLayer::run()
}
_activation_forget_gate.run();
- if(_run_cifg_opt)
+ if (_run_cifg_opt)
{
_ones_fill.run();
_subtract_input_gate.run();
@@ -659,13 +851,13 @@ void CLLSTMLayer::run()
{
_fully_connected_input_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_input_gate.run();
_accum_input_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_input_gate.run();
_pixelwise_mul_input_gate_coeff.run();
@@ -678,12 +870,10 @@ void CLLSTMLayer::run()
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights);
pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2);
- CLScheduler::get().enqueue_op(*_transpose_cell_state,
- pack,
- false);
+ CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false);
_gemm_cell_state1.run();
_accum_cell_state1.run();
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_cell_gate.run();
_pixelwise_mul_cell_gate_coeff.run();
@@ -694,19 +884,19 @@ void CLLSTMLayer::run()
_pixelwise_mul_cell_state2.run();
_accum_cell_state2.run();
- if(_perform_cell_clipping)
+ if (_perform_cell_clipping)
{
_cell_clip.run();
}
_fully_connected_output.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_output_state1.run();
_accum_output1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_output_gate.run();
_pixelwise_mul_output_gate_coeff.run();
@@ -717,10 +907,10 @@ void CLLSTMLayer::run()
_activation_output_state.run();
_pixelwise_mul_output_state2.run();
- if(_has_projection_weights)
+ if (_has_projection_weights)
{
_fully_connected_output_state.run();
- if(_perform_projection_clipping)
+ if (_perform_projection_clipping)
{
_projection_clip.run();
}
@@ -734,10 +924,10 @@ void CLLSTMLayer::run()
void CLLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_concat_weights_forget_gate.run();
- if(!_run_cifg_opt)
+ if (!_run_cifg_opt)
{
_concat_weights_input_gate.run();
}
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index a44dcd2e24..ea64eda023 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -25,14 +25,11 @@
#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include <memory>
@@ -49,44 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit
} // namespace
CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
- _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(),
- _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(),
- _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr),
- _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr),
- _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(),
- _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(),
- _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false)
+ : _memory_group(std::move(memory_manager)),
+ _gemmlowp(),
+ _output_stage(),
+ _transpose_weights(),
+ _concat_input_weights(),
+ _concat_recurrent_weights(),
+ _concat_weights(),
+ _concat_inputs(),
+ _concat_bias(),
+ _sigmoid_forget_gate(),
+ _sigmoid_input_gate(),
+ _sigmoid_output_gate(),
+ _tanh_modulation_gate(),
+ _tanh_output_state(),
+ _add_cell_state_tmps(),
+ _add2(),
+ _mul_forget_gate_cell_state(),
+ _mul_input_gate_input_mod_gate(),
+ _mul_output_state_tmp_output_gate(),
+ _slice_input_tensor(),
+ _slice_forget_tensor(),
+ _slice_cell_tensor(),
+ _slice_output_tensor(),
+ _dequantize(),
+ _quantize(),
+ _input_to_input_weights(nullptr),
+ _input_to_forget_weights(nullptr),
+ _input_to_cell_weights(nullptr),
+ _input_to_output_weights(nullptr),
+ _recurrent_to_input_weights(nullptr),
+ _recurrent_to_forget_weights(nullptr),
+ _recurrent_to_cell_weights(nullptr),
+ _recurrent_to_output_weights(nullptr),
+ _input_gate_bias(nullptr),
+ _forget_gate_bias(nullptr),
+ _cell_bias(nullptr),
+ _output_gate_bias(nullptr),
+ _recurrent_weights(),
+ _input_weights(),
+ _weights(),
+ _input(),
+ _weights_transposed(),
+ _output_highp(),
+ _output_lowp(),
+ _bias(),
+ _forget_gate_input(),
+ _input_gate_input(),
+ _output_gate_input(),
+ _input_modulation_gate_input(),
+ _forget_gate_output(),
+ _input_gate_output(),
+ _output_gate_output(),
+ _input_modulation_gate_output(),
+ _cell_state_tmp1(),
+ _cell_state_tmp2(),
+ _output_state_tmp(),
+ _output_state_out_symm(),
+ _output_state_out_f32(),
+ _is_prepared(false)
{
}
void CLLSTMLayerQuantized::configure(const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out)
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
- output_state_out);
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
}
-void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out)
+void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
-
- ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
- input_to_output_weights->info(),
- recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+ cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
+
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(
+ input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+ recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info()));
const int input_size = input->info()->dimension(0);
const int batch_size = input->info()->dimension(1);
@@ -94,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
- auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
- auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+ auto_init_if_empty(*cell_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+ auto_init_if_empty(*output_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
_input_to_input_weights = input_to_input_weights;
_input_to_forget_weights = input_to_forget_weights;
@@ -123,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
- _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _input_weights.allocator()->init(
+ TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY);
- _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _recurrent_weights.allocator()->init(
+ TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY);
std::vector<const ICLTensor *> weights_vector;
weights_vector.emplace_back(&_recurrent_weights);
weights_vector.emplace_back(&_input_weights);
- _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _weights.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX);
_transpose_weights.configure(compile_context, &_weights, &_weights_transposed);
@@ -143,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
input_vector.emplace_back(output_state_in);
_memory_group.manage(&_input);
- _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+ _input.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
_concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX);
// Bias concatenation
@@ -158,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
// Invert the offset for gemmlowp
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
// Run gemmlowp
_memory_group.manage(&_output_highp);
@@ -168,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
// Set the offset back
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
// multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
_output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
@@ -179,90 +269,122 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_memory_group.manage(&_output_lowp);
- _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+
+ GEMMLowpOutputStageInfo info{};
+ info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ info.gemmlowp_multiplier = output_multiplier;
+ info.gemmlowp_shift = output_shift;
+ info.output_data_type = DataType::QSYMM16;
+ _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, info);
_output_highp.allocator()->allocate();
_bias.allocator()->allocate();
// Get the gate tensors
- if(batch_size > 1)
+ if (batch_size > 1)
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0},
+ {output_size, batch_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0},
+ {2 * output_size, batch_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input,
+ {2 * output_size, 0}, {3 * output_size, batch_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0},
+ {4 * output_size, batch_size});
_output_lowp.allocator()->allocate();
}
else
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size},
+ {2 * output_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+ {3 * output_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size},
+ {4 * output_size});
_output_lowp.allocator()->allocate();
}
// Forget gate
_memory_group.manage(&_forget_gate_output);
- _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_output.allocator()->init(
+ TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_forget_gate_input.allocator()->allocate();
// Input gate
_memory_group.manage(&_input_gate_output);
- _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_output.allocator()->init(
+ TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_input_gate_input.allocator()->allocate();
// Input modulation gate equation
_memory_group.manage(&_input_modulation_gate_output);
- _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _input_modulation_gate_output.allocator()->init(
+ TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_input_modulation_gate_input.allocator()->allocate();
// Output gate
_memory_group.manage(&_output_gate_output);
- _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_output.allocator()->init(
+ TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_output_gate_input.allocator()->allocate();
// Long term memory
_memory_group.manage(&_cell_state_tmp1);
- _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state_tmp1.allocator()->init(
+ TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_forget_gate_output.allocator()->allocate();
_memory_group.manage(&_cell_state_tmp2);
- _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state_tmp2.allocator()->init(
+ TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output,
+ &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_input_modulation_gate_output.allocator()->allocate();
_input_gate_output.allocator()->allocate();
- _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+ _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out,
+ ConvertPolicy::SATURATE);
_cell_state_tmp1.allocator()->allocate();
_cell_state_tmp2.allocator()->allocate();
// Short term memory
_memory_group.manage(&_output_state_tmp);
- _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _output_state_tmp.allocator()->init(
+ TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_memory_group.manage(&_output_state_out_symm);
- _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_state_out_symm.allocator()->init(
+ TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output,
+ &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate_output.allocator()->allocate();
_output_state_tmp.allocator()->allocate();
// Requantize the output state from QSYMM16 to QASYMM8
_memory_group.manage(&_output_state_out_f32);
- _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+ _output_state_out_f32.allocator()->init(
+ TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
_dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32);
_output_state_out_symm.allocator()->allocate();
@@ -271,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
}
Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
- output_state_in, cell_state_out, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8);
const int input_size = input->dimension(0);
@@ -292,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
- TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
- TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
- TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+ TensorInfo input_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(input_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(output_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo bias_info(
+ input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+ TensorInfo output_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QASYMM8)
+ .set_quantization_info(qasymm));
+ TensorInfo cell_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QSYMM16)
+ .set_quantization_info(qsymm_4));
// Shape checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
// Data type checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
// Quantization checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
@@ -336,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
- ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
// _concat_weights
std::vector<const ITensorInfo *> weights_vector;
@@ -346,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
// _transpose_weights
const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
- TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+ TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed));
// _concat_inputs
@@ -372,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
// _gemmlowp
const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
// Set the offset back
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -383,78 +542,107 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
// _output_stage
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
+ GEMMLowpOutputStageInfo info{};
+ info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ info.gemmlowp_multiplier = output_multiplier;
+ info.gemmlowp_shift = output_shift;
+ info.output_data_type = DataType::QSYMM16;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info));
TensorInfo input_gate_input;
TensorInfo forget_gate_input;
TensorInfo input_modulation_gate_input;
TensorInfo output_gate_input;
- if(batch_size > 1)
+ if (batch_size > 1)
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
}
else
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
}
// _sigmoid_forget_gate
const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _sigmoid_input_gate
const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _tanh_modulation_gate
- const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+ qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _sigmoid_output_gate
const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&output_gate_input, &output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _mul_forget_gate_cell_state
const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
// _mul_input_gate_input_mod_gate
const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+ &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _add_cell_state_tmps
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
// _tanh_modulation_gate
const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, &output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _mul_output_state_tmp_output_gate
const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+ &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _dequantize
const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -463,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
// _quantize
ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out));
- if(cell_state_out->total_size() != 0)
+ if (cell_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
}
- if(output_state_out->total_size() != 0)
+ if (output_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -529,7 +717,7 @@ void CLLSTMLayerQuantized::run()
void CLLSTMLayerQuantized::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_input_weights.allocator()->allocate();
_concat_input_weights.run();
diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp
index 98c98abed5..ea21c54bc3 100644
--- a/src/runtime/CL/functions/CLLogicalAnd.cpp
+++ b/src/runtime/CL/functions/CLLogicalAnd.cpp
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
@@ -31,8 +34,12 @@ namespace arm_compute
{
namespace experimental
{
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
k->configure(compile_context, LogicalOperation::And, input1, input2, output);
_kernel = std::move(k);
@@ -51,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors)
struct CLLogicalAnd::Impl
{
- const ICLTensor *src0{ nullptr };
- const ICLTensor *src1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLLogicalAnd> op{ nullptr };
+ const ICLTensor *src0{nullptr};
+ const ICLTensor *src1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLLogicalAnd> op{nullptr};
};
-CLLogicalAnd::CLLogicalAnd()
- : _impl(std::make_unique<Impl>())
+CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
+CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default;
CLLogicalAnd::~CLLogicalAnd() = default;
@@ -70,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
_impl->src0 = input1;
_impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp
index 388d2bce86..71f9cce54f 100644
--- a/src/runtime/CL/functions/CLLogicalNot.cpp
+++ b/src/runtime/CL/functions/CLLogicalNot.cpp
@@ -25,23 +25,23 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClLogicalNot.h"
+#include "src/gpu/cl/operators/ClLogicalNot.h"
namespace arm_compute
{
struct CLLogicalNot::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClLogicalNot> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClLogicalNot> op{nullptr};
};
-CLLogicalNot::CLLogicalNot()
- : _impl(std::make_unique<Impl>())
+CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
+CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default;
CLLogicalNot::~CLLogicalNot() = default;
@@ -72,4 +72,4 @@ void CLLogicalNot::run()
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
index 897963ab50..3db4fdae84 100644
--- a/src/runtime/CL/functions/CLLogicalOr.cpp
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
@@ -31,8 +34,12 @@ namespace arm_compute
{
namespace experimental
{
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
k->configure(compile_context, LogicalOperation::Or, input1, input2, output);
_kernel = std::move(k);
@@ -51,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors)
struct CLLogicalOr::Impl
{
- const ICLTensor *src0{ nullptr };
- const ICLTensor *src1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLLogicalOr> op{ nullptr };
+ const ICLTensor *src0{nullptr};
+ const ICLTensor *src1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLLogicalOr> op{nullptr};
};
-CLLogicalOr::CLLogicalOr()
- : _impl(std::make_unique<Impl>())
+CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
+CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default;
CLLogicalOr::~CLLogicalOr() = default;
@@ -70,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
_impl->src0 = input1;
_impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
new file mode 100644
index 0000000000..e8bdad706b
--- /dev/null
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMatMul.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/operators/ClMatMul.h"
+
+namespace arm_compute
+{
+using OperatorType = opencl::ClMatMul;
+
+struct CLMatMul::Impl
+{
+ std::unique_ptr<OperatorType> op{nullptr};
+ ITensorPack run_pack{};
+};
+CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLMatMul::~CLMatMul() = default;
+
+void CLMatMul::configure(ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *output,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(settings);
+ configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
+}
+
+void CLMatMul::configure(const CLCompileContext &compile_context,
+ ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *output,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
+ ARM_COMPUTE_UNUSED(settings);
+
+ _impl->op = std::make_unique<OperatorType>();
+ _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
+ _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
+}
+
+Status CLMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
+{
+ return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
+}
+
+void CLMatMul::run()
+{
+ _impl->op->run(_impl->run_pack);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
index 52151cdfe1..7494f379b9 100644
--- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
@@ -27,32 +27,44 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
namespace arm_compute
{
CLMaxUnpoolingLayer::CLMaxUnpoolingLayer()
- : _fill(),
- _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
+ : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
{
}
CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default;
-void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info);
}
-void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
const PixelValue zero_value(0.f);
_fill.configure(output, zero_value);
_unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info);
}
-Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
}
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index 0f6a0e47a4..5892c0e840 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
namespace arm_compute
@@ -33,8 +35,12 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, epsilon);
auto k = std::make_unique<CLMeanStdDevNormalizationKernel>();
k->configure(compile_context, input, output, epsilon);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 12560f1b02..f93f82f1a2 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -30,6 +30,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
@@ -48,28 +50,35 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
+void CLNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_LOG_PARAMS(input, output, norm_info);
// Configure normalization kernel
_norm_kernel->configure(compile_context, input, output, norm_info);
- if(!_norm_kernel->border_size().empty())
+ if (!_norm_kernel->border_size().empty())
{
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+ _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT,
+ PixelValue());
}
}
-Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status CLNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
return CLNormalizationLayerKernel::validate(input, output, norm_info);
}
void CLNormalizationLayer::run()
{
- if(!_norm_kernel->border_size().empty())
+ if (!_norm_kernel->border_size().empty())
{
// Run border handler
CLScheduler::get().enqueue(*_border_handler, false);
@@ -78,4 +87,4 @@ void CLNormalizationLayer::run()
// Run normalization kernel
CLScheduler::get().enqueue(*_norm_kernel);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index 70189a2cb6..939c95bd45 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,26 +24,37 @@
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
#include <utility>
namespace arm_compute
{
-void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, mean, std);
auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>();
k->configure(compile_context, input, output, mean, std);
_kernel = std::move(k);
}
-Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
}
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index bb7aff218d..ce6d285ebe 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -22,10 +22,12 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
+
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPRelu.h"
+
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/operators/ClPRelu.h"
namespace arm_compute
{
@@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu;
struct CLPReluLayer::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
};
-CLPReluLayer::CLPReluLayer()
- : _impl(std::make_unique<Impl>())
+CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>())
{
}
-CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
+CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default;
CLPReluLayer::~CLPReluLayer() = default;
@@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp
configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
}
-void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+void CLPReluLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *alpha,
+ ICLTensor *output)
{
_impl->src_0 = input;
_impl->src_1 = alpha;
_impl->dst = output;
_impl->op = std::make_unique<OperatorType>();
- _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info()));
+ _impl->op->configure(compile_context, input->info(), alpha->info(),
+ (output == nullptr ? input->info() : output->info()));
}
Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index d105c0597c..e788ded512 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -22,34 +22,38 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLPadLayerKernel.h"
namespace arm_compute
{
-CLPadLayer::CLPadLayer()
- : _pad_kernel(std::make_unique<CLPadLayerKernel>()),
- _copy(),
- _perform_pad(false)
+CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false)
{
}
CLPadLayer::~CLPadLayer() = default;
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(
+ ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+ ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
- _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
- {
- return info.first > 0 || info.second > 0;
- });
+ _perform_pad =
+ std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
- if(_perform_pad)
+ if (_perform_pad)
{
_pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
}
@@ -59,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i
_copy.configure(compile_context, input, output);
}
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
- bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
- {
- return info.first > 0 || info.second > 0;
- });
+ bool perform_pad =
+ std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
- if(perform_pad)
+ if (perform_pad)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
}
@@ -78,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
}
void CLPadLayer::run()
{
- if(_perform_pad)
+ if (_perform_pad)
{
CLScheduler::get().enqueue(*_pad_kernel);
}
@@ -87,4 +93,4 @@ void CLPadLayer::run()
_copy.run();
}
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index c1da2a9eca..7f97eed98a 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -27,20 +27,21 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
+#include "src/gpu/cl/operators/ClPermute.h"
namespace arm_compute
{
struct CLPermute::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClPermute> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClPermute> op{nullptr};
};
-CLPermute::CLPermute()
- : _impl(std::make_unique<Impl>())
+CLPermute::CLPermute() : _impl(std::make_unique<Impl>())
{
}
@@ -51,9 +52,13 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu
configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
}
-void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+void CLPermute::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PermutationVector &perm)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, perm);
_impl->src = input;
_impl->dst = output;
@@ -74,4 +79,4 @@ void CLPermute::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 932659268d..6aa9d9cbb3 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClMul.h"
+#include "src/gpu/cl/operators/ClMul.h"
#include <utility>
@@ -34,38 +35,55 @@ namespace arm_compute
{
struct CLPixelWiseMultiplication::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClMul> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClMul> op{nullptr};
};
-CLPixelWiseMultiplication::CLPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
-CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
+CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default;
CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default;
-void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy,
+ rounding_policy, act_info);
}
-void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClMul>();
- _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+ _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy,
+ rounding_policy, act_info);
}
-Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
@@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run()
struct CLComplexPixelWiseMultiplication::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClComplexMul> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClComplexMul> op{nullptr};
};
-CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default;
-
-void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLComplexPixelWiseMultiplication &
+CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
+CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default;
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClComplexMul::validate(input1, input2, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp
new file mode 100644
index 0000000000..ce1092a7cc
--- /dev/null
+++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClPool3d.h"
+
+namespace arm_compute
+{
+struct CLPooling3dLayer::Impl
+{
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ ICLTensor *indices{nullptr};
+ std::unique_ptr<opencl::ClPool3d> op{nullptr};
+};
+
+CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>())
+{
+}
+CLPooling3dLayer::~CLPooling3dLayer() = default;
+
+void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info);
+}
+
+void CLPooling3dLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Pooling3dLayerInfo &pool_info)
+{
+ _impl->src = input;
+ _impl->dst = output;
+
+ _impl->op = std::make_unique<opencl::ClPool3d>();
+ _impl->op->configure(compile_context, input->info(), output->info(), pool_info);
+}
+
+Status
+CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+{
+ return opencl::ClPool3d::validate(input, output, pool_info);
+}
+
+void CLPooling3dLayer::run()
+{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+ pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
+ _impl->op->run(pack);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 7ba911c342..65e53b9be3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -25,41 +25,52 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPool2d.h"
+#include "src/gpu/cl/operators/ClPool2d.h"
namespace arm_compute
{
struct CLPoolingLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- ICLTensor *indices{ nullptr };
- std::unique_ptr<opencl::ClPool2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ ICLTensor *indices{nullptr};
+ std::unique_ptr<opencl::ClPool2d> op{nullptr};
};
-CLPoolingLayer::CLPoolingLayer()
- : _impl(std::make_unique<Impl>())
+CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>())
{
}
CLPoolingLayer::~CLPoolingLayer() = default;
-void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
}
-void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices)
{
_impl->src = input;
_impl->dst = output;
_impl->indices = indices;
_impl->op = std::make_unique<opencl::ClPool2d>();
- _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
+ _impl->op->configure(compile_context, input->info(), output->info(), pool_info,
+ (indices) ? indices->info() : nullptr);
}
-Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CLPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
return opencl::ClPool2d::validate(input, output, pool_info, indices);
}
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 5ace7c6d7a..cfd0ec4fbf 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,28 +29,40 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
using namespace arm_compute;
-CLPriorBoxLayer::CLPriorBoxLayer()
- : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
+CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
{
}
-void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info);
}
-void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info)
{
- _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
- _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
- if(!info.max_sizes().empty())
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
+ _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.min_sizes().size() * sizeof(float));
+ _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.aspect_ratios().size() * sizeof(float));
+ if (!info.max_sizes().empty())
{
- _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float));
+ _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.max_sizes().size() * sizeof(float));
}
auto k = std::make_unique<CLPriorBoxLayerKernel>();
@@ -58,7 +70,10 @@ void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I
_kernel = std::move(k);
}
-Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayer::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
return CLPriorBoxLayerKernel::validate(input1, input2, output, info);
-} \ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index fcf5b9d2a4..12f6f89290 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,30 +26,36 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
namespace arm_compute
{
using namespace arm_compute::utils::info_helpers;
+using namespace arm_compute::opencl::kernels;
namespace
{
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
- float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensorInfo *mm_input,
+ const ITensorInfo *mm_weights,
+ const ITensorInfo *bias,
+ float gemmlowp_scale,
+ const TensorInfo *mm_res_info,
+ const TensorInfo *outstage_tensor_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
return Status{};
}
} // namespace
@@ -79,33 +85,31 @@ void CLQLSTMLayer::TensorCopyKernel::run()
_src->map(q, true);
_dst->map(q, true);
- Iterator input_iter{ _src, _window };
- Iterator output_iter{ _dst, _window };
+ Iterator input_iter{_src, _window};
+ Iterator output_iter{_dst, _window};
- execute_window_loop(_window, [&](const Coordinates &)
- {
- memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
- },
- input_iter, output_iter);
+ execute_window_loop(
+ _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+ output_iter);
_src->unmap(q);
_dst->unmap(q);
}
CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _input_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _recurrent_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _input_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _recurrent_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _input_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _recurrent_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _input_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _recurrent_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
- _projection_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
+ : _input_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _recurrent_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _input_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _recurrent_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _input_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _recurrent_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _input_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _recurrent_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _projection_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
_layer_norms(),
_copy_output()
{
- for(auto &norm : _layer_norms)
+ for (auto &norm : _layer_norms)
{
norm = std::make_unique<CLQLSTMLayerNormalizationKernel>();
}
@@ -130,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf
{
// Output quantization scale will be different, but ignored here
// since it will be configured at configure() stage.
- const TensorInfo out
- {
- in
- };
+ const TensorInfo out{in};
return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
}
-void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,
- CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context,
+ CLGEMMLowpMatrixMultiplyCore &mm,
+ CLGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ICLTensor *mm_input,
+ const ICLTensor *mm_weights,
+ const ICLTensor *bias,
+ CLTensor *mm_res,
+ CLTensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info)
{
_memory_group.manage(mm_res);
_memory_group.manage(outstage_res);
@@ -152,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML
mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);
// Configure output stage
- quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);
mm_res->allocator()->allocate();
}
-void CLQLSTMLayer::configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
- cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params);
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+ output_state_in, cell_state_out, output_state_out, output, lstm_params);
}
-void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
@@ -183,16 +213,20 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
cell_state_out, output_state_out, output);
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out, output, lstm_params);
// Set lstm parameters
LSTMParams<ITensorInfo> lstm_params_info{};
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info));
const int batch_size = input->info()->dimension(1);
const int num_units = input_to_output_weights->info()->dimension(1);
@@ -213,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
// Layer normalization
_has_layer_norm = lstm_params.use_layer_norm();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -235,49 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
_has_cell_clipping = quantized_cell_clip > 0;
// Precompute effective bias for optimizing the matmul computations.
- if(!_has_cifg)
+ if (!_has_cifg)
{
_input_to_input_weights = lstm_params.input_to_input_weights();
_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
- _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(),
+ _input_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction->configure(
+ compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
}
- _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- if(_has_projection)
+ _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(),
+ _input_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction->configure(
+ compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction->configure(
+ compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(),
+ _input_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction->configure(
+ compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ if (_has_projection)
{
- _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
- if(_projection_bias != nullptr)
+ _projection_reduction->configure(
+ compile_context, _projection_weights->info(), _projection_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ if (_projection_bias != nullptr)
{
- _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+ _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias,
+ &_projection_eff_bias, ConvertPolicy::SATURATE);
}
}
// Pre-transpose weights to be used in GEMM.
- _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);
- _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);
- _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);
- _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
- _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
- _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
- if(!_has_cifg)
+ _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights,
+ &_input_to_forget_weights_transposed);
+ _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights,
+ &_input_to_cell_weights_transposed);
+ _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights,
+ &_input_to_output_weights_transposed);
+ _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights,
+ &_recurrent_to_forget_weights_transposed);
+ _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights,
+ &_recurrent_to_cell_weights_transposed);
+ _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights,
+ &_recurrent_to_output_weights_transposed);
+ if (!_has_cifg)
{
- _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
- _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+ _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(),
+ &_input_to_input_weights_transposed);
+ _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(),
+ &_recurrent_to_input_weights_transposed);
}
- if(_has_projection)
+ if (_has_projection)
{
_transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);
}
@@ -290,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
// Forget gate.
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
- const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
- input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
- &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+ &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+ &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
&_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
mm_out_info, forget_gate_outstage_info);
- _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
ConvertPolicy::SATURATE);
_input_to_forget_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
_mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_forget_res);
- _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _cell_to_forget_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_forget_outstage_res);
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+ const float cell_to_forget_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr,
+ &_cell_to_forget_outstage_res, gemmlowp_info);
_mul_cell_to_forget_res.allocator()->allocate();
- _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res,
+ &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
ConvertPolicy::SATURATE);
_cell_to_forget_outstage_res.allocator()->allocate();
}
CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res);
_recurrent_to_forget_outstage_res.allocator()->allocate();
@@ -338,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_forget_gate);
_forget_gate.allocator()->init(forget_gate_info);
- _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
forget_activation_input->allocator()->allocate();
// Modulation gate.
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
- input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
- &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
- mm_out_info, cell_outstage_info);
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
- &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
- mm_out_info, cell_outstage_info);
-
- _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+ const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input,
+ &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res,
+ &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info);
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+ &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
+
+ _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res,
+ &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
ConvertPolicy::SATURATE);
_input_to_cell_outstage_res.allocator()->allocate();
CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);
_recurrent_to_cell_outstage_res.allocator()->allocate();
@@ -371,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_cell_gate);
_cell_gate.allocator()->init(cell_gate_info);
- _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
cell_activation_input->allocator()->allocate();
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_input_gate.allocator()->init(input_gate_info);
_memory_group.manage(&_input_gate);
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.allocator()->init(*_forget_gate.info());
_input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -386,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
}
else
{
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
- input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
- &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
- mm_out_info, input_outstage_info);
-
- const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+ &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+ &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
+
+ const float recurrent_to_input_scale =
+ _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
mm_out_info, input_outstage_info);
- _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
- ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_input_to_input_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
- _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _mul_cell_to_input_res.allocator()->init(
+ TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_input_res);
- _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+ &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ const float cell_to_input_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_input_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_input_outstage_res);
- _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+ _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr,
+ &_cell_to_input_outstage_res, gemmlowp_info);
_mul_cell_to_input_res.allocator()->allocate();
- _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_cell_to_input_outstage_res.allocator()->allocate();
}
CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);
_recurrent_to_input_outstage_res.allocator()->allocate();
input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
}
- _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
input_activation_input->allocator()->allocate();
}
// Cell.
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
- _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
- const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+ const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(mul_input_cell_scale, 0));
_memory_group.manage(&_mul_input_cell_res);
_mul_input_cell_res.allocator()->init(mul_input_cell_info);
- _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_gate.allocator()->allocate();
- _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+ _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out,
+ ConvertPolicy::SATURATE);
_mul_input_cell_res.allocator()->allocate();
_forget_gate.allocator()->allocate();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
- _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+ _cell_clip.configure(compile_context, cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip));
}
// Output gate.
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
- input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
- &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
- mm_out_info, output_outstage_info);
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.output_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+ &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+ &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
&_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
mm_out_info, output_outstage_info);
- _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+ &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
ConvertPolicy::SATURATE);
_input_to_output_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
_mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_output_res);
- _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-
- const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(),
+ &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+
+ const float cell_to_output_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.output_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_output_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_output_outstage_res);
- _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+ _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr,
+ &_cell_to_output_outstage_res, gemmlowp_info);
_mul_cell_to_output_res.allocator()->allocate();
- _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+ &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
ConvertPolicy::SATURATE);
_cell_to_output_outstage_res.allocator()->allocate();
}
CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);
_recurrent_to_output_outstage_res.allocator()->allocate();
@@ -496,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_output_gate);
_output_gate.allocator()->init(output_gate_info);
- _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
output_activation_input->allocator()->allocate();
// Hidden.
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
_memory_group.manage(&_hidden_mul_res);
const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
_hidden_mul_res.allocator()->init(hidden_mul_res);
- _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_output_gate.allocator()->allocate();
_input_gate.allocator()->allocate();
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = output_state_in->info()->data_type();
@@ -518,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_memory_group.manage(&_hidden_gate);
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->init(*output_state_out->info());
_hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -529,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_hidden_mul_res.allocator()->allocate();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
const TensorInfo projection_outstage_info(*output_state_out->info());
- const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
- gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
- gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
- gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
-
- TensorInfo projection_mm_out_info{ mm_out_info };
+ const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
- hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
- &_mm_projection_res, &_projection_outstage_res, projection_scale,
- projection_mm_out_info, projection_outstage_info);
+ configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+ &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+ &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
ICLTensor *accumulate_destination = output_state_out;
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->allocate();
_projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -558,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
accumulate_destination = &_projection_accumulate_res;
}
- _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+ _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination,
+ accumulate_destination, ConvertPolicy::SATURATE);
_projection_outstage_res.allocator()->allocate();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
_projection_accumulate_res.allocator()->allocate();
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
- quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+ quantized_projection_clip =
+ utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip));
+ _projection_clip.configure(compile_context, output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip));
_has_projection_clipping = true;
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
_hidden_gate.allocator()->allocate();
@@ -593,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_copy_output.configure(compile_context, output_state_out, output);
}
-Status CLQLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status CLQLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
- cell_state_out, output_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ cell_state_in, output_state_in, cell_state_out, output_state_out, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -615,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+ input_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -640,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
// Check whether peephole weights are all there or none
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
}
}
@@ -667,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
@@ -675,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Precompute effective bias for optimizing the matmul computations.
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.input_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_forget_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_cell_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_output_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
- lstm_params.hidden_state_zero(),
- true)));
- if(lstm_params.projection_bias() != nullptr)
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.projection_weights(), &projection_eff_bias_info,
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+ if (lstm_params.projection_bias() != nullptr)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
- &projection_eff_bias_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+ &projection_eff_bias_info, ConvertPolicy::SATURATE));
}
}
- const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
- const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+ const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1,
+ input_to_forget_weights->data_type(),
+ input_to_forget_weights->quantization_info());
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
// Validate weights transpose
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
@@ -710,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
}
GEMMLowpOutputStageInfo gemmlowp_info;
@@ -731,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Forget gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
- const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
- const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_forget_scale, &mm_out_info, &forget_outstage_info));
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+ &forget_outstage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_forget_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
const ITensorInfo *b_info = forget_gate_bias;
@@ -763,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Modulation gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
- const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &input_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
- if(has_layer_norm)
+ const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+ &cell_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+ &cell_outstage_info, ConvertPolicy::SATURATE));
+
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
const ITensorInfo *b_info = cell_bias;
@@ -784,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+ "Input gate bias must not be present when CIFG is used");
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+ &forget_gate_info, ConvertPolicy::SATURATE));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+ lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
- const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
- if(lstm_params.has_peephole_opt())
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ const float recurrent_to_input_scale =
+ lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+ &input_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
+
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+ 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_input_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
const ITensorInfo *b_info = lstm_params.input_gate_bias();
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_outstage_info, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
}
// Cell.
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
- if(quantized_cell_clip > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+ if (quantized_cell_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
- quantized_cell_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip)));
}
// Output gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+ &output_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+ DataType::QSYMM16);
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
// Here we are not using the output stage because all operations are done in float
// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
const ITensorInfo *b_info = output_gate_bias;
@@ -870,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&output_outstage_info, &output_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Hidden.
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = hidden_out_info.data_type();
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
const bool projection_tensor_copy_required = num_units != output_size;
// Projection.
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+ lstm_params.projection_weights());
ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
- const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
const TensorInfo projection_outstage_info(*output_state_out);
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
- TensorInfo projection_mm_out_info{ mm_out_info };
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+ &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
&projection_outstage_info));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+ ConvertPolicy::SATURATE));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip)));
}
}
else
{
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
}
}
- if(cell_state_out->total_size() > 0)
+ if (cell_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
}
- if(output_state_out->total_size() > 0)
+ if (output_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -973,14 +1210,14 @@ void CLQLSTMLayer::run()
_recurrent_to_forget_outstage.run();
_accumulate_input_recurrent_forget.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_forget.run();
_cell_to_forget_outstage.run();
_accumulate_cell_forget.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));
}
@@ -995,7 +1232,7 @@ void CLQLSTMLayer::run()
_recurrent_to_cell_outstage.run();
_accumulate_input_recurrent_modulation.run();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));
}
@@ -1003,7 +1240,7 @@ void CLQLSTMLayer::run()
_cell_gate_tanh.run();
// Input gate
- if(_has_cifg)
+ if (_has_cifg)
{
_input_gate_sub.run();
}
@@ -1015,14 +1252,14 @@ void CLQLSTMLayer::run()
_recurrent_to_input_outstage.run();
_accumulate_input_recurrent_input.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_input.run();
_cell_to_input_outstage.run();
_accumulate_cell_input.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));
}
@@ -1034,7 +1271,7 @@ void CLQLSTMLayer::run()
_pixelwise_mul_forget_cell.run();
_pixelwise_mul_input_cell.run();
_add_forget_cell.run();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
_cell_clip.run();
}
@@ -1045,14 +1282,14 @@ void CLQLSTMLayer::run()
_mm_recurrent_to_output.run();
_recurrent_to_output_outstage.run();
_accumulate_input_recurrent_output.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_output.run();
_cell_to_output_outstage.run();
_accumulate_cell_to_output.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));
}
@@ -1065,31 +1302,31 @@ void CLQLSTMLayer::run()
_hidden_outstage.run();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
_mm_projection.run();
_projection_outstage.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_output_to_accumulate_copy.run();
}
_accumulate_projection.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.run();
}
- if(_has_projection_clipping)
+ if (_has_projection_clipping)
{
_projection_clip.run();
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.run();
}
@@ -1101,7 +1338,7 @@ void CLQLSTMLayer::run()
void CLQLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Pre-transpose weights to be used in GEMM.
_input_to_forget_weights_transposed.allocator()->allocate();
@@ -1118,18 +1355,25 @@ void CLQLSTMLayer::prepare()
_transpose_recurrent_to_output_weights.run();
// Precompute effective biases
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.map(true);
- std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+ std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 32767);
_ones.unmap();
}
else
{
_input_to_input_eff_bias.allocator()->allocate();
_recurrent_to_input_eff_bias.allocator()->allocate();
- CLScheduler::get().enqueue(*_input_to_input_reduction);
- CLScheduler::get().enqueue(*_recurrent_to_input_reduction);
+
+ ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights},
+ {ACL_DST, &_input_to_input_eff_bias}};
+ CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false);
+
+ ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights},
+ {ACL_DST, &_recurrent_to_input_eff_bias}};
+ CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false);
_input_to_input_weights_transposed.allocator()->allocate();
_recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1144,18 +1388,36 @@ void CLQLSTMLayer::prepare()
_recurrent_to_cell_eff_bias.allocator()->allocate();
_input_to_output_eff_bias.allocator()->allocate();
_recurrent_to_output_eff_bias.allocator()->allocate();
- CLScheduler::get().enqueue(*_input_to_forget_reduction);
- CLScheduler::get().enqueue(*_recurrent_to_forget_reduction);
- CLScheduler::get().enqueue(*_input_to_cell_reduction);
- CLScheduler::get().enqueue(*_recurrent_to_cell_reduction);
- CLScheduler::get().enqueue(*_input_to_output_reduction);
- CLScheduler::get().enqueue(*_recurrent_to_output_reduction);
-
- if(_has_projection)
+
+ ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights},
+ {ACL_DST, &_input_to_forget_eff_bias}};
+ CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false);
+
+ ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights},
+ {ACL_DST, &_recurrent_to_forget_eff_bias}};
+ CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false);
+
+ ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}};
+ CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false);
+
+ ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights},
+ {ACL_DST, &_recurrent_to_cell_eff_bias}};
+ CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false);
+
+ ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights},
+ {ACL_DST, &_input_to_output_eff_bias}};
+ CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false);
+
+ ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights},
+ {ACL_DST, &_recurrent_to_output_eff_bias}};
+ CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false);
+
+ if (_has_projection)
{
_projection_eff_bias.allocator()->allocate();
- CLScheduler::get().enqueue(*_projection_reduction);
- if(_projection_bias != nullptr)
+ ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}};
+ CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false);
+ if (_projection_bias != nullptr)
{
_projection_bias_add.run();
_projection_bias->mark_as_unused();
@@ -1165,7 +1427,7 @@ void CLQLSTMLayer::prepare()
_transpose_projection_weights.run();
_projection_weights->mark_as_unused();
- if(!_projection_tensor_copy_required)
+ if (!_projection_tensor_copy_required)
{
_hidden_gate.mark_as_unused();
_projection_accumulate_res.mark_as_unused();
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index e6451b2eb4..6edef29992 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -25,20 +25,20 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClQuantize.h"
+#include "src/gpu/cl/operators/ClQuantize.h"
namespace arm_compute
{
struct CLQuantizationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClQuantize> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClQuantize> op{nullptr};
};
-CLQuantizationLayer::CLQuantizationLayer()
- : _impl(std::make_unique<Impl>())
+CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>())
{
}
CLQuantizationLayer::~CLQuantizationLayer() = default;
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 755fa40121..34b78eefa7 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -28,27 +28,37 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(),
+ : _memory_group(std::move(memory_manager)),
+ _gemm_state_f(),
+ _add_kernel(),
+ _activation(),
+ _fully_connected_kernel(),
+ _copy(),
+ _fully_connected_out(),
+ _gemm_output(),
+ _add_output(),
_is_prepared(false)
{
}
CLRNNLayer::~CLRNNLayer() = default;
-Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
- const ITensorInfo *output, const ActivationLayerInfo &info)
+Status CLRNNLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -66,28 +76,43 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
- auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+ auto shape_info =
+ TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info));
return Status{};
}
-void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+void CLRNNLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
ActivationLayerInfo &info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state,
+ output, info);
}
-void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
- ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info)
+void CLRNNLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+ bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
@@ -135,7 +160,7 @@ void CLRNNLayer::run()
void CLRNNLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fully_connected_kernel.prepare();
_gemm_state_f.prepare();
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 291ccff958..1939d1d0ba 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,25 +24,39 @@
#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
#include "arm_compute/core/CL/ICLArray.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
namespace arm_compute
{
-Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
return Status{};
}
-void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
// Configure ROI pooling kernel
auto k = std::make_unique<CLROIAlignLayerKernel>();
k->configure(compile_context, input, rois, output, pool_info);
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index cf7d4bcbc3..0d2eab0c76 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -22,23 +22,38 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
#include "arm_compute/core/CL/ICLArray.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
using namespace arm_compute;
-Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info);
}
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
// Configure ROI pooling kernel
auto k = std::make_unique<CLROIPoolingLayerKernel>();
k->configure(compile_context, input, rois, output, pool_info);
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index d4735c875d..5c3f7f9c8c 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLRangeKernel.h"
using namespace arm_compute;
@@ -36,8 +38,10 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRange::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
+ ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
auto k = std::make_unique<CLRangeKernel>();
k->set_target(CLScheduler::get().target());
k->configure(compile_context, output, start, end, step);
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index b761dc2f99..bef8d887fd 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
@@ -36,12 +38,14 @@ namespace arm_compute
{
namespace
{
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
@@ -49,29 +53,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
//axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
// Only validate if not using auto_init for the output tensor
TensorShape out_shape = input->tensor_shape();
// Validate output_shape only if not using auto_init
convert_negative_axis(axis_local, input_dims);
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+ for (unsigned int i = 0; i < reduction_ops; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
+ if (output->total_size() > 0 && keep_dims)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
}
- if(keep_dims)
+ if (keep_dims)
{
out_shape.set(axis_local[i], 1);
}
@@ -80,13 +91,14 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
const unsigned int remove_index = axis_local[i] - i;
ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
- out_shape.remove_dimension(remove_index);
+ out_shape.remove_dimension(remove_index, false);
}
}
const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
- const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
- if(requant)
+ const bool requant =
+ is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
+ if (requant)
{
TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
CLDequantizationLayer::validate(input, &input_no_quant);
@@ -96,10 +108,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
}
return Status{};
}
-}
+} // namespace
CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
+ : _memory_group(std::move(memory_manager)),
+ _reduction_kernels(),
+ _reduced_outs(),
+ _reshape(),
+ _dequant(),
+ _requant(),
+ _reduction_ops(),
+ _keep_dims(),
+ _do_requant(),
+ _input_no_quant(),
_output_no_quant()
{
}
@@ -109,15 +130,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output);
}
-void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+void CLReduceMean::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ ICLTensor *output)
{
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+ ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
+
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
+ _do_requant = is_data_type_quantized(input->info()->data_type()) &&
+ input->info()->quantization_info() != output->info()->quantization_info();
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -125,7 +154,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
ICLTensor *tmp_input = input;
ICLTensor *tmp_output = output;
- if(_do_requant)
+ if (_do_requant)
{
_memory_group.manage(&_input_no_quant);
_memory_group.manage(&_output_no_quant);
@@ -144,46 +173,57 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ TensorShape out_shape =
+ i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
- if(i == _reduction_ops - 1 && keep_dims)
+ if (i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i],
+ ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(),
+ tmp_input->info()->data_type(),
+ tmp_input->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
- _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i],
+ ReductionOperation::MEAN_SUM);
}
}
// Allocate intermediate tensors
- for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
// Configure reshape layer if we want to drop the dimensions
- if(!_keep_dims)
+ if (!_keep_dims)
{
TensorShape out_shape = tmp_input->info()->tensor_shape();
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(int i = 0; i < _reduction_ops; ++i)
+#pragma GCC diagnostic pop
+ for (int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_local[i] - i);
+ out_shape.remove_dimension(axis_local[i] - i, false);
}
auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output);
}
- if(_do_requant)
+ if (_do_requant)
{
_requant.configure(compile_context, &_output_no_quant, output);
_input_no_quant.allocator()->allocate();
@@ -191,7 +231,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
}
}
-Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status CLReduceMean::validate(const ITensorInfo *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ const ITensorInfo *output)
{
return validate_config(input, reduction_axis, keep_dims, output);
}
@@ -200,19 +243,19 @@ void CLReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_do_requant)
+ if (_do_requant)
{
_dequant.run();
}
- for(auto &kernel : _reduction_kernels)
+ for (auto &kernel : _reduction_kernels)
{
kernel.run();
}
- if(!_keep_dims)
+ if (!_keep_dims)
{
_reshape.run();
}
- if(_do_requant)
+ if (_do_requant)
{
_requant.run();
}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 61859f8de8..ba5489018e 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -27,9 +27,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/runtime/Utils.h"
@@ -37,23 +39,31 @@
namespace arm_compute
{
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false)
+ : _memory_group(std::move(memory_manager)),
+ _unreshaped_output(),
+ _reduction_kernel(),
+ _reshape(),
+ _reduction_axis(),
+ _is_reshape_required(false)
{
}
CLReductionOperation::~CLReductionOperation() = default;
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status CLReductionOperation::validate(
+ const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const bool is_reshape_required = !keep_dims;
- if(is_reshape_required && output->total_size() != 0)
+ if (is_reshape_required && output->total_size() != 0)
{
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
}
@@ -65,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
const auto input_qinfo = input->quantization_info();
const auto output_data_type = output->data_type();
- auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
- {
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+ QuantizationInfo qinfo) {
ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
};
- if(is_reshape_required)
+ if (is_reshape_required)
{
auto shape_before_reshape = input_shape;
shape_before_reshape.set(axis, 1);
- initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
+ initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles,
+ input_qinfo);
output_internal = &output_before_reshape;
}
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
- if(is_reshape_required)
+ if (is_reshape_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output));
}
@@ -90,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
{
- if(!_is_reshape_required)
+ if (!_is_reshape_required)
{
return output;
}
@@ -101,24 +112,37 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor
return &_unreshaped_output;
}
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(
+ ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims);
}
-void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
_reduction_axis = axis;
_is_reshape_required = !keep_dims;
auto *output_internal = configure_intermediate_result_vector(input, output);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- const auto output_data_type = input->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = input->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
_memory_group.manage(&_unreshaped_output);
}
@@ -126,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
_reduction_kernel = std::make_unique<CLReductionOperationKernel>();
_reduction_kernel->configure(compile_context, input, output_internal, axis, op);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.configure(compile_context, &_unreshaped_output, output);
_unreshaped_output.allocator()->allocate();
@@ -139,7 +163,7 @@ void CLReductionOperation::run()
CLScheduler::get().enqueue(*_reduction_kernel, false);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.run();
}
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
deleted file mode 100644
index 0a1f864543..0000000000
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLRemap.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLRemapKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value);
-}
-
-void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
- BorderMode border_mode,
- uint8_t constant_border_value)
-{
- auto k = std::make_unique<CLRemapKernel>();
- k->configure(compile_context, input, map_x, map_y, output, RemapInfo{ policy, border_mode, PixelValue(constant_border_value) });
- _kernel = std::move(k);
- _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
index 69b28abab3..156e9b90c1 100644
--- a/src/runtime/CL/functions/CLReorgLayer.cpp
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include <utility>
@@ -38,8 +40,12 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, stride);
auto k = std::make_unique<CLReorgLayerKernel>();
k->configure(compile_context, input, output, stride);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index 060eddb96c..3d6349fb25 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -27,25 +27,25 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClReshape.h"
+#include "src/gpu/cl/operators/ClReshape.h"
/** [CLReshapeLayer snippet] **/
namespace arm_compute
{
struct CLReshapeLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClReshape> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClReshape> op{nullptr};
};
-CLReshapeLayer::CLReshapeLayer()
- : _impl(std::make_unique<Impl>())
+CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>())
{
}
-CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
+CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default;
CLReshapeLayer::~CLReshapeLayer() = default;
@@ -78,4 +78,4 @@ void CLReshapeLayer::run()
_impl->op->run(pack);
}
} // namespace arm_compute
-/** [CLReshapeLayer snippet] **/ \ No newline at end of file
+ /** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
index 2a845bae13..a20be2335a 100644
--- a/src/runtime/CL/functions/CLReverse.cpp
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,24 +24,34 @@
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLReverseKernel.h"
namespace arm_compute
{
-void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis);
}
-void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverse::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis);
auto k = std::make_unique<CLReverseKernel>();
- k->configure(compile_context, input, output, axis);
+ k->configure(compile_context, input, output, axis, use_inverted_axis);
_kernel = std::move(k);
}
-Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status CLReverse::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
- return CLReverseKernel::validate(input, output, axis);
+ return CLReverseKernel::validate(input, output, axis, use_inverted_axis);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index cbd93c1086..abff0724e4 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,20 +26,20 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClScale.h"
+#include "src/gpu/cl/operators/ClScale.h"
namespace arm_compute
{
struct CLScale::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClScale> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClScale> op{nullptr};
};
-CLScale::CLScale()
- : _impl(std::make_unique<Impl>())
+CLScale::CLScale() : _impl(std::make_unique<Impl>())
{
}
CLScale::~CLScale() = default;
@@ -49,7 +49,10 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
+void CLScale::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ScaleKernelInfo &info)
{
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLScatter.cpp b/src/runtime/CL/functions/CLScatter.cpp
new file mode 100644
index 0000000000..e16fcc4ccc
--- /dev/null
+++ b/src/runtime/CL/functions/CLScatter.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLScatter.h"
+
+#include "arm_compute/function_info/ScatterInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/gpu/cl/operators/ClScatter.h"
+
+namespace arm_compute
+{
+using OperatorType = opencl::ClScatter;
+
+struct CLScatter::Impl
+{
+ std::unique_ptr<OperatorType> op{nullptr};
+ ITensorPack run_pack{};
+};
+
+CLScatter::CLScatter() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLScatter::~CLScatter() = default;
+
+void CLScatter::configure(const ICLTensor *src,
+ const ICLTensor *updates,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ configure(CLKernelLibrary::get().get_compile_context(), src, updates, indices, output, info);
+}
+
+void CLScatter::configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *updates,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(updates, indices, output);
+
+ _impl->op = std::make_unique<OperatorType>();
+ if (src)
+ { // Src not nullptr.
+ _impl->op->configure(compile_context, src->info(), updates->info(), indices->info(), output->info(), info);
+ }
+ else
+ {
+ _impl->op->configure(compile_context, nullptr, updates->info(), indices->info(), output->info(), info);
+ }
+ _impl->run_pack = {{ACL_SRC_0, src}, {ACL_SRC_1, updates}, {ACL_SRC_2, indices}, {ACL_DST, output}};
+}
+
+Status CLScatter::validate(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const ScatterInfo &info)
+{
+ return OperatorType::validate(src, updates, indices, output, info);
+}
+
+void CLScatter::run()
+{
+ _impl->op->run(_impl->run_pack);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index 5ec18a032f..b4897d9e62 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLSelectKernel.h"
using namespace arm_compute;
@@ -36,8 +38,13 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor
configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output);
}
-void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelect::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(c, x, y, output);
auto k = std::make_unique<CLSelectKernel>();
k->configure(compile_context, c, x, y, output);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index 7f39143dc7..f79c6a1235 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,15 +26,22 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -44,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn
_kernel = std::move(k);
}
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
// Check start dimensions for being non-negative
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
- {
- return i < 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -63,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co
struct CLSlice::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLSlice> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLSlice> op{nullptr};
};
-CLSlice::CLSlice()
- : _impl(std::make_unique<Impl>())
+CLSlice::CLSlice() : _impl(std::make_unique<Impl>())
{
}
-CLSlice::CLSlice(CLSlice &&) = default;
+CLSlice::CLSlice(CLSlice &&) = default;
CLSlice &CLSlice::operator=(CLSlice &&) = default;
CLSlice::~CLSlice() = default;
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
return experimental::CLSlice::validate(input, output, starts, ends);
}
@@ -86,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin
configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
}
-void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index de58bf1b02..2e70e2aa08 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -22,16 +22,18 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h"
+
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
-#include "src/runtime/gpu/cl/operators/ClSoftmax.h"
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+#include "src/gpu/cl/operators/ClPermute.h"
+#include "src/gpu/cl/operators/ClSoftmax.h"
namespace arm_compute
{
@@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax;
template <bool IS_LOG>
struct CLSoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<CLTensor> workspace_tensors{};
@@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(
+ const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
{
_impl->src = input;
_impl->dst = output;
_impl->op = std::make_unique<OperatorType>();
- SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis };
+ SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis};
_impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
template <bool IS_LOG>
-Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
- SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis };
+ SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis};
return OperatorType::validate(*input, *output, softmax_info);
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::run()
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
{
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_impl->memory_group);
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 6180f4de07..37f728895f 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -29,67 +29,100 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
namespace arm_compute
{
CLSpaceToBatchLayer::CLSpaceToBatchLayer()
- : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()),
- _fill(),
- _has_padding(false)
+ : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false)
{
}
CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default;
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _fill.configure(compile_context, output,
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
_space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _fill.configure(compile_context, output,
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
- _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right,
+ output);
}
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -97,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
void CLSpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- if(_has_padding)
+ if (_has_padding)
{
//CLScheduler::get().enqueue(*_fill, true);
_fill.run();
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index 842d5bc5cc..22695c9ef3 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,12 +29,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
namespace arm_compute
{
-CLSpaceToDepthLayer::CLSpaceToDepthLayer()
- : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
+CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
{
}
@@ -45,8 +46,12 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
_space_to_depth_kernel->configure(compile_context, input, output, block_shape);
}
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index 0b27371e3f..6be43cc5cd 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/helpers/AutoConfiguration.h"
namespace arm_compute
@@ -38,7 +39,7 @@ void CLSplit::run()
{
cl::CommandQueue q = CLScheduler::get().queue();
- for(unsigned i = 0; i < _num_outputs; ++i)
+ for (unsigned i = 0; i < _num_outputs; ++i)
{
_slice_functions[i].run();
}
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 3ef6a27675..c15496fc31 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <complex>
-
#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -32,14 +30,16 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLStackLayerKernel.h"
+#include <complex>
+
namespace arm_compute
{
CLStackLayer::CLStackLayer() // NOLINT
- : _input(),
- _stack_kernels(),
- _num_inputs(0)
+ : _input(), _stack_kernels(), _num_inputs(0)
{
}
@@ -50,15 +50,19 @@ void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, IC
configure(CLKernelLibrary::get().get_compile_context(), input, axis, output);
}
-void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+void CLStackLayer::configure(const CLCompileContext &compile_context,
+ const std::vector<ICLTensor *> &input,
+ int axis,
+ ICLTensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, axis, output);
_num_inputs = input.size();
_stack_kernels.reserve(_num_inputs);
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for (unsigned int i = 0; i < _num_inputs; i++)
{
_stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>());
_stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output);
@@ -76,7 +80,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
const unsigned int num_inputs = input.size();
- for(unsigned int i = 0; i < num_inputs; i++)
+ for (unsigned int i = 0; i < num_inputs; i++)
{
// All the tensors must have the same rank
ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
@@ -89,7 +93,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
void CLStackLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ for (unsigned i = 0; i < _num_inputs; i++)
{
CLScheduler::get().enqueue(*_stack_kernels[i], false);
}
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index fd3db9341a..c1953cc415 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,24 +25,38 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto k = std::make_unique<CLStridedSliceKernel>();
k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
_kernel = std::move(k);
}
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
}
@@ -50,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out
struct CLStridedSlice::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- CLRuntimeContext *ctx{ nullptr };
- std::unique_ptr<experimental::CLStridedSlice> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ CLRuntimeContext *ctx{nullptr};
+ std::unique_ptr<experimental::CLStridedSlice> op{nullptr};
};
-CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
+CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default;
CLStridedSlice::~CLStridedSlice() = default;
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -83,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC
_impl->dst = output;
_impl->op = std::make_unique<experimental::CLStridedSlice>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask,
+ end_mask, shrink_axis_mask);
}
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
void CLStridedSlice::run()
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
index 818f10f1ac..4f86c4adfa 100644
--- a/src/runtime/CL/functions/CLTile.cpp
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLTile.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLTileKernel.h"
namespace arm_compute
@@ -32,8 +33,12 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTile::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, multiples);
auto k = std::make_unique<CLTileKernel>();
k->configure(compile_context, input, output, multiples);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index 142cf73259..5a738f47ce 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -27,19 +27,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClTranspose.h"
+#include "src/gpu/cl/operators/ClTranspose.h"
namespace arm_compute
{
struct CLTranspose::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClTranspose> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClTranspose> op{nullptr};
};
-CLTranspose::CLTranspose()
- : _impl(std::make_unique<Impl>())
+CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>())
{
}
CLTranspose::~CLTranspose() = default;
@@ -70,4 +70,4 @@ void CLTranspose::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 28d122b3cf..ddd83e7824 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/common/utils/Log.h"
+
namespace arm_compute
{
namespace
@@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
}
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start,
+ int32_t &slice_end_mask,
+ const unsigned int input_num_dimensions)
{
// Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time.
Coordinates slice_end;
slice_start.set_num_dimensions(input_num_dimensions);
slice_end.set_num_dimensions(input_num_dimensions);
- for(size_t k = 0; k < input_num_dimensions; ++k)
+ for (size_t k = 0; k < input_num_dimensions; ++k)
{
slice_start.set(k, 0);
slice_end.set(k, -1);
@@ -54,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &
} // namespace
CLUnstack::CLUnstack() // NOLINT
- : _num_slices(0),
- _strided_slice_vector()
+ : _num_slices(0), _strided_slice_vector()
{
}
@@ -64,14 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *>
configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis);
}
-void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+void CLUnstack::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const std::vector<ICLTensor *> &output_vector,
+ int axis)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis);
std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
- std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t->info();
- });
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+ [](ICLTensor *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
@@ -84,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens
Coordinates slice_start;
int32_t slice_end_mask;
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
- for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ for (unsigned int slice = 0; slice < _num_slices; ++slice)
{
// Adjusts start and end coordinates to take a 2D slice at a time
slice_start.set(axis_u, slice);
- _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask, (1 << axis_u));
}
}
@@ -103,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn
ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
Coordinates slice_start;
int32_t slice_end_mask;
- for(size_t k = 0; k < num_slices; ++k)
+ for (size_t k = 0; k < num_slices; ++k)
{
slice_start.set(wrap_axis(axis, input), k);
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
- ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask,
+ (1 << wrap_axis(axis, input))));
}
return Status{};
}
void CLUnstack::run()
{
- for(unsigned i = 0; i < _num_slices; ++i)
+ for (unsigned i = 0; i < _num_slices; ++i)
{
_strided_slice_vector[i].run();
}
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index f758c3d0b3..645f817030 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -26,25 +26,25 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h"
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
#include "support/Cast.h"
namespace arm_compute
{
struct CLWinogradConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr};
ITensorPack run_pack{};
- ITensorPack prep_pack{};
MemoryGroup memory_group{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -55,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryMa
CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default;
-void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
- bool enable_fast_math)
+void CLWinogradConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+ enable_fast_math);
}
-void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
_impl->src = input;
_impl->weights = weights;
@@ -71,22 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClWinogradConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info,
+ enable_fast_math);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, _impl->src },
- { TensorType::ACL_SRC_1, _impl->weights },
- { TensorType::ACL_SRC_2, _impl->biases },
- { TensorType::ACL_DST, _impl->dst }
- };
-
- _impl->prep_pack = { { TensorType::ACL_SRC_1, _impl->weights } };
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src},
+ {TensorType::ACL_SRC_1, _impl->weights},
+ {TensorType::ACL_SRC_2, _impl->biases},
+ {TensorType::ACL_DST, _impl->dst}};
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
-Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
}
@@ -100,10 +114,14 @@ void CLWinogradConvolutionLayer::run()
void CLWinogradConvolutionLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
- _impl->op->prepare(_impl->prep_pack);
+ _impl->op->prepare(_impl->run_pack);
+
+ // Release Preparation tensors
+ release_prepare_tensors(_impl->workspace_tensors, _impl->run_pack);
+ _impl->run_pack.remove_tensor(TensorType::ACL_SRC_1);
_impl->is_prepared = true;
}
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
index 390bb97665..4270165ab4 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
@@ -25,7 +25,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
#include <utility>
@@ -34,8 +35,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -44,131 +44,133 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Default configurations for Bifrost architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G71 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G52 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G76 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
const DataType data_type = params.data_type;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G71:
- if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
+ if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
{
- return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G76:
- if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
+ if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
{
- return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G52:
- if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
+ if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
{
- return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
default:
- if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+ if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
{
- return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1;
+ CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE;
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if((m > 1) && (n < 16))
+ if ((m > 1) && (n < 16))
{
- gemm_type = CLGEMMKernelType::RESHAPED_V1;
+ gemm_type = CLGEMMKernelType::RESHAPED;
}
- else if(m == 1)
+ else if (m == 1)
{
gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if((k > 256) && (m > 4))
+ if ((k > 256) && (m > 4))
{
constexpr float alpha = 3.2f;
constexpr float fact0 = 1.51f;
constexpr float fact1 = 1.66f;
constexpr float ops = 12.0f;
const float scale = k > 1024 ? 1.07f : 1.0f;
- gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1;
+ gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops))
+ ? CLGEMMKernelType::RESHAPED
+ : CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- gemm_type = CLGEMMKernelType::NATIVE_V1;
+ gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
}
const auto workload = static_cast<float>((m * n) / 20.0f);
- gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED_V1)) ? CLGEMMKernelType::RESHAPED : gemm_type;
+ gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED
+ : gemm_type;
}
return gemm_type;
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -179,15 +181,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned
}
else
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(k <= 496)
+ if (k <= 496)
{
- if(n <= 544)
+ if (n <= 544)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int
}
else
{
- if(k <= 588)
+ if (k <= 588)
{
- if(k <= 552)
+ if (k <= 552)
{
- if(m <= 148)
+ if (m <= 148)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 278)
+ if (m <= 278)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k));
- if(r_mn <= 1.5469f)
+ if (r_mn <= 1.5469f)
{
- if(r_mk <= 0.8766f)
+ if (r_mk <= 0.8766f)
{
- if(r_mk <= 0.0211f)
+ if (r_mk <= 0.0211f)
{
- if(r_mnk <= 77.5833f)
+ if (r_mnk <= 77.5833f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_nk <= 0.0832f)
+ if (r_nk <= 0.0832f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mnk <= 193.0000f)
+ if (r_mnk <= 193.0000f)
{
- if(r_mn <= 0.9948f)
+ if (r_mn <= 0.9948f)
{
- if(r_mk <= 2.5453f)
+ if (r_mk <= 2.5453f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mn <= 17.7370f)
+ if (r_mn <= 17.7370f)
{
- if(r_mnk <= 1391.2875f)
+ if (r_mnk <= 1391.2875f)
{
- if(r_mk <= 2.9724f)
+ if (r_mk <= 2.9724f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_mnk <= 470.0000f)
+ if (r_mnk <= 470.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_nk <= 0.1381f)
+ if (r_nk <= 0.1381f)
{
- if(r_mnk <= 9040.5000f)
+ if (r_mnk <= 9040.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mn <= 5.6790f)
+ if (r_mn <= 5.6790f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- if(k <= 212)
+ if (k <= 212)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_nk <= 0.4990234375f)
+ if (r_nk <= 0.4990234375f)
{
- if(k <= 1392)
+ if (k <= 1392)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 325)
+ if (m <= 325)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
}
else
{
- if(k <= 471)
+ if (k <= 471)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_mn <= 0.04475911520421505f)
+ if (r_mn <= 0.04475911520421505f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(n <= 127.0000f)
+ if (n <= 127.0000f)
{
- if(n <= 63.5000f)
+ if (n <= 63.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 3616.0000f)
+ if (m <= 3616.0000f)
{
- if(b <= 18.5000f)
+ if (b <= 18.5000f)
{
- if(m <= 2970.5000f)
+ if (m <= 2970.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 104.0000f)
+ if (k <= 104.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
else
{
- if(m <= 12.5000f)
+ if (m <= 12.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 104.0000f)
+ if (k <= 104.0000f)
{
- if(b <= 18.5000f)
+ if (b <= 18.5000f)
{
- if(m <= 490.0000f)
+ if (m <= 490.0000f)
{
- if(n <= 272.0000f)
+ if (n <= 272.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
else
{
- if(m <= 226.0000f)
+ if (m <= 226.0000f)
{
- if(n <= 140.0000f)
+ if (n <= 140.0000f)
{
- if(m <= 179.5000f)
+ if (m <= 179.5000f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -556,22 +563,18 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
+ ARM_COMPUTE_UNUSED(n);
+ ARM_COMPUTE_UNUSED(k);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if(m == 1)
+ if (m == 1)
{
- if(n > k)
- {
- return CLGEMMKernelType::NATIVE_V1;
- }
- else
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
+ return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
@@ -580,7 +583,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int
}
else
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
}
} // namespace cl_gemm
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
index b799de6967..673038a8db 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
@@ -26,7 +26,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
#include <utility>
@@ -35,8 +36,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Configurations for Midgard architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}};
const DataType data_type = params.data_type;
- if(gemm_configs.find(data_type) != gemm_configs.end())
+ if (gemm_configs.find(data_type) != gemm_configs.end())
{
return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
}
@@ -68,23 +67,26 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec
ARM_COMPUTE_ERROR("Not supported data type");
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
// We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
- return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1;
+ return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
// We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
- return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1;
+ return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant);
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
index 982748810d..851e23bc84 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
#include <utility>
@@ -34,8 +35,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -44,189 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Default configurations for Valhall architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G77 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G78 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
+
+ // Mali-G710 and Mali-G610 configurations
+ static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
+
+ // Mali-G715 and Mali-G615 configurations
+ static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
const DataType data_type = params.data_type;
- switch(_target)
+ switch (_target)
{
+ case GPUTarget::G710:
+ case GPUTarget::G610:
+ if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end())
+ {
+ return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
+ }
+ ARM_COMPUTE_ERROR("Not supported data type");
+ case GPUTarget::G715:
+ case GPUTarget::G615:
+ if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end())
+ {
+ return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
+ }
+ ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G78:
- if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
+ if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
{
- return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G77:
- if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
+ if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
{
- return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
default:
- if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+ if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
{
- return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1;
+ return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1;
+ return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
- {
- return CLGEMMKernelType::NATIVE_V1;
- }
+ ARM_COMPUTE_UNUSED(m, n, k, b);
- if(m == 1)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
+ return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
+}
- const float r_mn = static_cast<float>(m) / static_cast<float>(n);
- const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+{
+ ARM_COMPUTE_UNUSED(m, n, k, b);
- if(r_mk <= 0.6817956566810608)
- {
- if(workload <= 801.6000061035156)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- else
- {
- if(r_mn <= 0.0839829258620739)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- else
- {
- if(r_mk <= 0.24917218834161758)
- {
- return CLGEMMKernelType::RESHAPED;
- }
- else
- {
- if(workload <= 2551.75)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- else
- {
- if(workload <= 5061.574951171875)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- else
- {
- return CLGEMMKernelType::RESHAPED;
- }
- }
- }
- }
- }
- }
- else
- {
- if(r_mk <= 4.849947690963745)
- {
- if(workload <= 17618.4501953125)
- {
- if(workload <= 5224.699951171875)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- else
- {
- if(r_nk <= 0.7933054566383362)
- {
- return CLGEMMKernelType::RESHAPED;
- }
- else
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- }
- }
- else
- {
- if(workload <= 20275.2001953125)
- {
- return CLGEMMKernelType::RESHAPED;
- }
- else
- {
- if(r_mk <= 3.07421875)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- else
- {
- return CLGEMMKernelType::RESHAPED;
- }
- }
- }
- }
- else
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- }
+ return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -236,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(n <= 272.0000f)
+ if (n <= 272.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 471.0000f)
+ if (k <= 471.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 72.5000f)
+ if (m <= 72.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 90.5000f)
+ if (m <= 90.5000f)
{
return CLGEMMKernelType::RESHAPED;
}
else
{
- if(k <= 2448.0000f)
+ if (k <= 2448.0000f)
{
- if(n <= 756.0000f)
+ if (n <= 756.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -295,16 +243,60 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
+
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+{
+ if (!is_rhs_constant)
+ {
+ return default_f32(m, n, k, b, is_rhs_constant);
+ }
+
+ unsigned int best_m0;
+ unsigned int best_n0;
+
+ if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+ {
+ return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
+ }
+ else
+ {
+ return default_f32(m, n, k, b, is_rhs_constant);
+ }
+}
+
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+{
+ if (!is_rhs_constant)
+ {
+ return g78_f16(m, n, k, b, is_rhs_constant);
+ }
+
+ unsigned int best_m0;
+ unsigned int best_n0;
+
+ if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+ {
+ return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
+ }
+ else
+ {
+ return g78_f16(m, n, k, b, is_rhs_constant);
+ }
+}
+
} // namespace cl_gemm
} // namespace arm_compute
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h
index c88fbcf557..e190295ee4 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,6 +50,9 @@ private:
CLGEMMKernelType g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
CLGEMMKernelType g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
CLGEMMKernelType g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ CLGEMMKernelType g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ CLGEMMKernelType g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ CLGEMMKernelType g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
};
} // namespace cl_gemm
} // namespace arm_compute
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
index 6189a324cf..98dd44b1bf 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CLGEMMKERNELSELECTION_H
-#define SRC_CLGEMMKERNELSELECTION_H
+#ifndef ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H
+#define ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H
#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h"
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h"
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h"
@@ -45,13 +46,14 @@ public:
*/
static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu);
case GPUTarget::BIFROST:
return std::make_unique<CLGEMMDefaultTypeBifrost>(gpu);
case GPUTarget::VALHALL:
+ case GPUTarget::FIFTHGEN:
return std::make_unique<CLGEMMDefaultTypeValhall>(gpu);
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
@@ -60,4 +62,4 @@ public:
};
} // namespace cl_gemm
} // namespace arm_compute
-#endif /* SRC_CLGEMMKERNELSELECTION_H */
+#endif // ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
index b8437487f8..8df57197e2 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
@@ -27,11 +27,12 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h"
#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"
#include "src/runtime/CL/mlgo/Utils.h"
@@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_
bool valid = false;
CLGEMMKernelType gemm_type{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.",
+ to_string(gemm_type).c_str());
}
else
{
@@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery
{
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
- std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
+ std::unique_ptr<IClGemmKernelConfig> gemm_config =
+ ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query)
@@ -100,32 +104,36 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigReshapedOnlyRHS config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
// Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter
- std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs,
- config.export_cl_image);
+ std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs,
+ !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query)
{
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
- std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
+ std::unique_ptr<IClGemmKernelConfig> gemm_config =
+ ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
@@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigReshaped config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
- std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs,
- config.transpose_rhs, config.export_cl_image);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
+ std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs,
+ config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
@@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
@@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigNative config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
// Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do no matter
- std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
+ std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
} // namespace auto_heuristics
} // namespace cl_gemm
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
index 020237b7f4..f544715e03 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
@@ -50,8 +50,7 @@ struct CommonQuery
/** Result of querying about GEMM type ( @ref CLGEMMKernelType) */
struct GEMMTypeResult
{
- GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type)
- : valid{ valid }, gemm_type{ gemm_type }
+ GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type}
{
}
/** Test if the result is valid */
@@ -67,7 +66,7 @@ struct GEMMTypeResult
struct GEMMConfigResult
{
GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info)
- : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info }
+ : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info}
{
}
/** Test if the result is valid */
@@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query);
} // namespace cl_gemm
} // namespace arm_compute
-#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H \ No newline at end of file
+#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h
index c451bd9062..08a7ee8c18 100644
--- a/src/runtime/CL/mlgo/Common.h
+++ b/src/runtime/CL/mlgo/Common.h
@@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType;
/** GEMM Configuration for Native kernel */
struct GEMMConfigNative
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
};
/** GEMM Configuration for Reshaped Only RHS kernel */
struct GEMMConfigReshapedOnlyRHS
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
- bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */
- bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+ bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */
+ bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
/** GEMM Configuration for Reshaped kernel */
struct GEMMConfigReshaped
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
- bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
- bool transpose_rhs{ false }; /**< True if the (k0xn0) block has to be transposed before been stored */
- bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
+ bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+ bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before been stored */
+ bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
} // namespace mlgo
} // namespace arm_compute
-#endif // SRC_RUNTIME_CL_MLGO_COMMON_H \ No newline at end of file
+#endif // SRC_RUNTIME_CL_MLGO_COMMON_H
diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp
index 1c75cdc427..f7b706902b 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.cpp
+++ b/src/runtime/CL/mlgo/HeuristicTree.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/runtime/CL/mlgo/HeuristicTree.h"
+
#include "arm_compute/core/Log.h"
#include "support/Cast.h"
@@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond)
// PRE: all features and ConditionalOps are valid
constexpr float eps = 0.0001f;
// Calculate all secondary features
- std::vector<std::pair<std::string, float>> cond_values
- {
- { "m", static_cast<float>(shape.m) },
- { "n", static_cast<float>(shape.n) },
- { "k", static_cast<float>(shape.k) },
- { "b", static_cast<float>(shape.b) },
- { "r_mn", static_cast<float>(shape.m) / shape.n },
- { "r_mk", static_cast<float>(shape.m) / shape.k },
- { "r_nk", static_cast<float>(shape.n) / shape.k },
- { "r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k) },
- { "workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0 }
- };
- auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(),
- [&cond](decltype(*cond_values.begin()) it)
- {
- return it.first == cond.feature;
- });
+ std::vector<std::pair<std::string, float>> cond_values{
+ {"m", static_cast<float>(shape.m)},
+ {"n", static_cast<float>(shape.n)},
+ {"k", static_cast<float>(shape.k)},
+ {"b", static_cast<float>(shape.b)},
+ {"r_mn", static_cast<float>(shape.m) / shape.n},
+ {"r_mk", static_cast<float>(shape.m) / shape.k},
+ {"r_nk", static_cast<float>(shape.n) / shape.k},
+ {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)},
+ {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}};
+ auto cond_value_pair_it =
+ std::find_if(cond_values.begin(), cond_values.end(),
+ [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; });
ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end());
const float cond_value = cond_value_pair_it->second;
- switch(cond.op)
+ switch (cond.op)
{
case ConditionalOp::LT:
{
@@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes;
constexpr size_t HeuristicTree::_max_query_depth;
constexpr HeuristicTree::NodeID HeuristicTree::_root;
-HeuristicTree::HeuristicTree()
- : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
+HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
{
}
HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type)
- : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{}
+ : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{}
{
}
@@ -108,16 +104,17 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const
// Root ID = 0;
auto cur_node = _tree.at(_root).get();
size_t depth = 0;
- while(cur_node->type() != NodeType::Leaf)
+ while (cur_node->type() != NodeType::Leaf)
{
- if(depth > _max_query_depth)
+ if (depth > _max_query_depth)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?",
+ _max_query_depth);
return std::make_pair(false, T{});
}
ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType");
auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
- if(evaluate(shape, br_node->condition))
+ if (evaluate(shape, br_node->condition))
{
cur_node = _tree.at(br_node->true_node).get();
}
@@ -135,12 +132,12 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const
template <typename T>
bool HeuristicTree::add_leaf(NodeID id, T val)
{
- if(_tree.size() >= _max_num_nodes)
+ if (_tree.size() >= _max_num_nodes)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
return false;
}
- if(_tree.find(id) != _tree.end())
+ if (_tree.find(id) != _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
return false;
@@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val)
bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
{
- if(_tree.size() >= _max_num_nodes)
+ if (_tree.size() >= _max_num_nodes)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
return false;
}
- const std::set<std::string> supported_features =
- {
- "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"
- };
- const auto orig_feature = cond.feature;
- std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c)
- {
- return std::tolower(c);
- });
- if(supported_features.find(cond.feature) == supported_features.end())
+ const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"};
+ const auto orig_feature = cond.feature;
+ std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(),
+ [](char c) { return std::tolower(c); });
+ if (supported_features.find(cond.feature) == supported_features.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str());
return false;
}
- if(_tree.find(id) != _tree.end())
+ if (_tree.find(id) != _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
return false;
@@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID
bool HeuristicTree::check_if_structurally_correct() const
{
std::set<NodeID> visited;
- std::deque<NodeID> to_visit{ _root };
+ std::deque<NodeID> to_visit{_root};
- while(!to_visit.empty())
+ while (!to_visit.empty())
{
auto id = to_visit.front();
to_visit.pop_front();
- if(_tree.find(id) == _tree.end())
+ if (_tree.find(id) == _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id);
return false;
}
auto not_seen_before = visited.insert(id);
- if(!not_seen_before.second)
+ if (!not_seen_before.second)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops");
return false;
}
auto cur_node = _tree.at(id).get();
- if(cur_node->type() == NodeType::Branch)
+ if (cur_node->type() == NodeType::Branch)
{
auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
to_visit.push_back(br_node->true_node);
to_visit.push_back(br_node->false_node);
}
}
- if(visited.size() != _tree.size())
+ if (visited.size() != _tree.size())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes");
return false;
@@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const
bool HeuristicTree::check()
{
- if(_tree.empty())
+ if (_tree.empty())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered");
return false;
}
- if(_tree.find(_root) == _tree.end())
+ if (_tree.find(_root) == _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root);
return false;
@@ -237,7 +229,8 @@ template std::pair<bool, GEMMType> HeuristicTree::query<GEMMType>(GEMMShape shap
/** Explicit template instantiation @relates HeuristicTree */
template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const;
/** Explicit template instantiation @relates HeuristicTree */
-template std::pair<bool, GEMMConfigReshapedOnlyRHS> HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
+template std::pair<bool, GEMMConfigReshapedOnlyRHS>
+HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
/** Explicit template instantiation @relates HeuristicTree */
template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const;
diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h
index d5c7de2215..a4f8c116b9 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.h
+++ b/src/runtime/CL/mlgo/HeuristicTree.h
@@ -25,6 +25,7 @@
#define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
#include "arm_compute/core/Types.h"
+
#include "src/runtime/CL/mlgo/Common.h"
#include <map>
@@ -84,7 +85,7 @@ public:
struct BranchNode : public Node
{
BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
- : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node }
+ : id{id}, condition{cond}, true_node{t_node}, false_node{f_node}
{
}
NodeType type() const override
@@ -100,8 +101,7 @@ public:
template <typename T>
struct LeafNode : public Node
{
- LeafNode(NodeID id, T val)
- : id{ id }, value{ val }
+ LeafNode(NodeID id, T val) : id{id}, value{val}
{
}
NodeType type() const override
@@ -177,22 +177,22 @@ public:
bool check();
private:
- static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query
- static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree
- static constexpr NodeID _root{ 0 }; // Root tree ID
+ static constexpr size_t _max_query_depth{1000}; // Maximum depth of query
+ static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree
+ static constexpr NodeID _root{0}; // Root tree ID
private:
bool check_if_structurally_correct() const;
private:
- TreeID _id; /**< Heuristic tree ID */
- HeuristicType _heuristic_type; /**< Heuristic type */
- std::string _ip_target; /**< IP target associated with the tree */
- DataType _data_type; /**< Data type associated with the tree */
- std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */
+ TreeID _id; /**< Heuristic tree ID */
+ HeuristicType _heuristic_type; /**< Heuristic type */
+ std::string _ip_target; /**< IP target associated with the tree */
+ DataType _data_type; /**< Data type associated with the tree */
+ std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */
};
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
index 80f3bb85e9..aed46cd80f 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
@@ -24,6 +24,7 @@
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"
#include "arm_compute/core/Log.h"
+
#include "src/runtime/CL/mlgo/MLGOParser.h"
#include "src/runtime/CL/mlgo/Utils.h"
@@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs)
}
bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs)
{
- return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs,
- rhs.export_cl_image);
+ return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) ==
+ std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
}
bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs)
{
- return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0,
- rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
+ return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs,
+ lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs,
+ rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
}
constexpr size_t MLGOHeuristics::_max_num_trees;
-MLGOHeuristics::MLGOHeuristics()
- : _indices{}, _trees{}, _tree_valid{}, _valid{ false }
+MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false}
{
}
@@ -59,71 +60,74 @@ std::pair<bool, GEMMType> MLGOHeuristics::query_gemm_type(const Query &query) co
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str());
const auto invalid = GEMMType::RESHAPED;
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMType>(shape_query);
}
std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigNative{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigNative>(shape_query);
}
std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigReshapedOnlyRHS{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query);
}
std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigReshaped{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigReshaped>(shape_query);
}
@@ -131,14 +135,14 @@ std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(c
bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id)
{
bool status;
- HeuristicTree *tree{ nullptr };
+ HeuristicTree *tree{nullptr};
std::tie(status, tree) = get_heuristic_tree(id);
- if(!status)
+ if (!status)
{
return status;
}
status = tree->check();
- if(!status)
+ if (!status)
{
return status;
}
@@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id)
bool MLGOHeuristics::check_all() const
{
// Tree validities are already checked and cached.
- bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v)
- {
- return !v.second;
- })
- == _tree_valid.end();
- if(!all_trees_are_checked)
+ bool all_trees_are_checked =
+ std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end();
+ if (!all_trees_are_checked)
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each "
+ "tree is completed. This could also indicate there are no trees in the dotmlgo");
return false;
}
@@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const
std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id)
{
- if(_indices.find(id) == _indices.end())
+ if (_indices.find(id) == _indices.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id);
return std::make_pair(false, nullptr);
}
const auto index = _indices[id];
- if(_trees.find(index) == _trees.end())
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
return std::make_pair(false, nullptr);
@@ -186,7 +188,7 @@ std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTre
bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
{
- if(_indices.size() >= _max_num_trees)
+ if (_indices.size() >= _max_num_trees)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees);
return false;
@@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
// PRE: correctness of t is guaranteed by the tree construction process
// Ensure unique id
const auto id = t.id();
- if(_indices.find(id) != _indices.end())
+ if (_indices.find(id) != _indices.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id);
return false;
@@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
// Ensure unique index
const auto index = t.index();
- if(_trees.find(index) != _trees.end())
+ if (_trees.find(index) != _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists");
return false;
@@ -219,9 +221,10 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename)
std::ifstream fs;
fs.exceptions(std::ifstream::badbit);
fs.open(filename, std::ios::in);
- if(!fs.is_open())
+ if (!fs.is_open())
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead",
+ filename.c_str());
return _valid = false;
}
return reload_from_stream(fs);
@@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename)
bool MLGOHeuristics::reload_from_stream(std::istream &in)
{
auto parsed = parser::parse_mlgo(in);
- if(!parsed.first)
+ if (!parsed.first)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead");
return _valid = false;
@@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in)
}
} // namespace mlgo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h
index aa21225959..6a491c5503 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.h
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.h
@@ -135,16 +135,16 @@ public:
bool check_all() const;
private:
- static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/
+ static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/
private:
// There exists a one-to-one mapping between TreeID and Index; either can be used to identify a @ref HeuristicTree
std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices; /**< A mapping from TreeID to Index */
std::map<HeuristicTree::Index, HeuristicTree> _trees; /**< A mapping from Index to HeuristicTree */
std::map<HeuristicTree::TreeID, bool> _tree_valid; /**< Result cache of the tree validity checks */
- bool _valid; /**< Overall validity */
+ bool _valid; /**< Overall validity */
};
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H
diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp
index 625739e450..893daf2ed9 100644
--- a/src/runtime/CL/mlgo/MLGOParser.cpp
+++ b/src/runtime/CL/mlgo/MLGOParser.cpp
@@ -22,19 +22,21 @@
* SOFTWARE.
*/
#include "src/runtime/CL/mlgo/MLGOParser.h"
+
#include "arm_compute/core/Log.h"
+
#include "src/runtime/CL/mlgo/Utils.h"
#include <sstream>
#define CHECK(parser_expr, valid_var) \
(parser_expr); \
- if(!valid_var) \
+ if (!valid_var) \
return;
#define CHECK_DEFAULT(parser_expr, valid_var, default_val) \
(parser_expr); \
- if(!valid_var) \
+ if (!valid_var) \
return default_val;
#ifdef ARM_COMPUTE_LOGGING_ENABLED
@@ -53,8 +55,7 @@
valid_var = false; \
return default_val;
-#define LOG_TOKEN_POS(tokens, pos_var) \
- const auto pos_var = tokens.current_pos();
+#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos();
#else // ARM_COMPUTE_LOGGING_ENABLED
@@ -73,19 +74,12 @@ namespace
{
void ltrim(std::string &str)
{
- str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch)
- {
- return !std::isspace(ch);
- }));
+ str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); }));
}
void rtrim(std::string &str)
{
- str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch)
- {
- return !std::isspace(ch);
- }).base(),
- str.end());
+ str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end());
}
void trim(std::string &str)
@@ -109,7 +103,7 @@ enum class ComparatorType
};
TokenStream::TokenStream(std::istream &s, const std::string &delims)
- : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{}
+ : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{}
{
read();
}
@@ -125,7 +119,7 @@ Token TokenStream::take()
ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
Token t = _tokens.front();
_tokens.pop_front();
- if(_tokens.empty())
+ if (_tokens.empty())
{
read();
}
@@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i)
ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead");
// NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends an End token at the end
- while(_istream && _tokens.size() <= i)
+ while (_istream && _tokens.size() <= i)
{
read();
}
@@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i)
void advance(CharPosition &pos, char ch)
{
- if(ch == '\n')
+ if (ch == '\n')
{
pos.ln += 1;
pos.col = 0;
@@ -167,17 +161,16 @@ void TokenStream::read()
do
{
// Reached eof
- if(!_istream.get(ch))
+ if (!_istream.get(ch))
{
- if(!reached_end())
+ if (!reached_end())
{
_tokens.emplace_back(TokenType::End, "", _lookahead_pos);
}
return;
}
advance(_lookahead_pos, ch);
- }
- while(std::isspace(ch) || is_delim(ch));
+ } while (std::isspace(ch) || is_delim(ch));
// Read chars until we hit a delim or eof
auto orig_pos = _lookahead_pos;
auto tok = recognize_tok(ch);
@@ -190,41 +183,41 @@ void TokenStream::read()
Token TokenStream::recognize_tok(char ch)
{
- if(ch == '[')
+ if (ch == '[')
{
- return Token{ TokenType::L_List, "", _lookahead_pos };
+ return Token{TokenType::L_List, "", _lookahead_pos};
}
- else if(ch == ']')
+ else if (ch == ']')
{
- return Token{ TokenType::R_List, "", _lookahead_pos };
+ return Token{TokenType::R_List, "", _lookahead_pos};
}
- else if(ch == '.')
+ else if (ch == '.')
{
- return float_after_dp_st(std::string{ ch });
+ return float_after_dp_st(std::string{ch});
}
- else if(std::isdigit(ch))
+ else if (std::isdigit(ch))
{
- return num_st(std::string{ ch });
+ return num_st(std::string{ch});
}
else
{
- return text_st(std::string{ ch });
+ return text_st(std::string{ch});
}
}
Token TokenStream::num_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(ch == '.')
+ if (ch == '.')
{
return float_after_dp_st(value + ch);
}
- else if(!std::isdigit(ch))
+ else if (!std::isdigit(ch))
{
- if(!is_delim(ch) && !std::isspace(ch))
+ if (!is_delim(ch) && !std::isspace(ch))
{
rewind(_lookahead_pos);
_istream.unget();
@@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Int, value, _lookahead_pos };
+ return Token{TokenType::Int, value, _lookahead_pos};
}
Token TokenStream::float_after_dp_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(!std::isdigit(ch))
+ if (!std::isdigit(ch))
{
- if(!is_delim(ch) && !std::isspace(ch))
+ if (!is_delim(ch) && !std::isspace(ch))
{
rewind(_lookahead_pos);
_istream.unget();
@@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Float, value, _lookahead_pos };
+ return Token{TokenType::Float, value, _lookahead_pos};
}
Token TokenStream::text_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(is_delim(ch))
+ if (is_delim(ch))
{
break;
}
- if(ch == '[' || ch == ']')
+ if (ch == '[' || ch == ']')
{
rewind(_lookahead_pos);
_istream.unget();
@@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Text, value, _lookahead_pos };
+ return Token{TokenType::Text, value, _lookahead_pos};
}
bool TokenStream::reached_end() const
@@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::End)
+ if (tok.type != TokenType::End)
{
FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream");
}
@@ -301,7 +294,7 @@ bool bool_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Int)
+ if (tok.type != TokenType::Int)
{
FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token");
}
@@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Int)
+ if (tok.type != TokenType::Int)
{
FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token");
}
@@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
int val = CHECK_DEFAULT(int_val(in, valid), valid, 0);
- if(val < 0)
+ if (val < 0)
{
FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token");
}
@@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Float)
+ if (tok.type != TokenType::Float)
{
FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token");
}
@@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Text || tok.value.empty())
+ if (tok.type != TokenType::Text || tok.value.empty())
{
FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token");
}
@@ -361,9 +354,9 @@ std::string text_val(TokenStream &in, bool &valid)
bool accept_text(TokenStream &in, const std::string &c_str, bool take = true)
{
auto tok = in.peek();
- if(tok.type == TokenType::Text && tok.value == c_str)
+ if (tok.type == TokenType::Text && tok.value == c_str)
{
- if(take)
+ if (take)
{
in.take();
}
@@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true)
void expect_text(TokenStream &in, const std::string &str, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_text(in, str))
+ if (!accept_text(in, str))
{
FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str);
}
@@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid)
bool accept_l_list(TokenStream &in)
{
auto tok = in.peek();
- if(tok.type == TokenType::L_List)
+ if (tok.type == TokenType::L_List)
{
in.take();
return true;
@@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in)
void expect_l_list(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_l_list(in))
+ if (!accept_l_list(in))
{
FAIL_WITH_MSG(valid, pos, "Expect '['");
}
@@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid)
bool accept_r_list(TokenStream &in)
{
auto tok = in.peek();
- if(tok.type == TokenType::R_List)
+ if (tok.type == TokenType::R_List)
{
in.take();
return true;
@@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in)
void expect_r_list(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_r_list(in))
+ if (!accept_r_list(in))
{
FAIL_WITH_MSG(valid, pos, "Expect ']'");
}
@@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid)
ConditionalOp conditional_op(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "<="))
+ if (accept_text(in, "<="))
{
return ConditionalOp::LE;
}
- else if(accept_text(in, ">="))
+ else if (accept_text(in, ">="))
{
return ConditionalOp::GE;
}
- else if(accept_text(in, "=="))
+ else if (accept_text(in, "=="))
{
return ConditionalOp::EQ;
}
- else if(accept_text(in, "<"))
+ else if (accept_text(in, "<"))
{
return ConditionalOp::LT;
}
- else if(accept_text(in, ">"))
+ else if (accept_text(in, ">"))
{
return ConditionalOp::GT;
}
@@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool &valid)
{
CHECK(expect_text(in, "ip-type", valid), valid);
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "gpu"))
+ if (accept_text(in, "gpu"))
{
;
}
- else if(accept_text(in, "cpu"))
+ else if (accept_text(in, "cpu"))
{
;
}
@@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid)
DataType data_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "f16"))
+ if (accept_text(in, "f16"))
{
return DataType::F16;
}
- else if(accept_text(in, "f32"))
+ else if (accept_text(in, "f32"))
{
return DataType::F32;
}
- else if(accept_text(in, "qasymm8"))
+ else if (accept_text(in, "qasymm8"))
{
return DataType::QASYMM8;
}
@@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid)
ComparatorType comparator_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "var"))
+ if (accept_text(in, "var"))
{
return ComparatorType::Var;
}
- else if(accept_text(in, "num"))
+ else if (accept_text(in, "num"))
{
return ComparatorType::Num;
}
- else if(accept_text(in, "enum"))
+ else if (accept_text(in, "enum"))
{
return ComparatorType::Enum;
}
@@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid)
HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "gemm-type", take))
+ if (accept_text(in, "gemm-type", take))
{
return HeuristicType::GEMM_Type;
}
- else if(accept_text(in, "gemm-config-native", take))
+ else if (accept_text(in, "gemm-config-native", take))
{
return HeuristicType::GEMM_Config_Native;
}
- else if(accept_text(in, "gemm-config-reshaped-only-rhs", take))
+ else if (accept_text(in, "gemm-config-reshaped-only-rhs", take))
{
return HeuristicType::GEMM_Config_Reshaped_Only_RHS;
}
- else if(accept_text(in, "gemm-config-reshaped", take))
+ else if (accept_text(in, "gemm-config-reshaped", take))
{
return HeuristicType::GEMM_Config_Reshaped;
}
@@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val
{
LOG_TOKEN_POS(in, pos);
auto ht = CHECK(heuristic_type(in, valid, false), valid);
- if(ht != expected_ht)
+ if (ht != expected_ht)
{
FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type");
}
@@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val
GEMMType gemm_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "native"))
+ if (accept_text(in, "native"))
{
return GEMMType::NATIVE;
}
- else if(accept_text(in, "reshaped-only-rhs"))
+ else if (accept_text(in, "reshaped-only-rhs"))
{
return GEMMType::RESHAPED_ONLY_RHS;
}
- else if(accept_text(in, "reshaped"))
+ else if (accept_text(in, "reshaped"))
{
return GEMMType::RESHAPED;
}
@@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid)
const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigNative{ m0, n0, k0 };
+ return GEMMConfigNative{m0, n0, k0};
}
GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid)
@@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v
const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex };
+ return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex};
}
GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid)
@@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid)
const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex };
+ return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex};
}
void gpu_priority(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "best-performance"))
+ if (accept_text(in, "best-performance"))
{
;
}
- else if(accept_text(in, "best-memory-usage"))
+ else if (accept_text(in, "best-memory-usage"))
{
;
}
@@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid)
void gpu_behavior(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "static"))
+ if (accept_text(in, "static"))
{
;
}
- else if(accept_text(in, "dynamic"))
+ else if (accept_text(in, "dynamic"))
{
;
}
@@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid)
void free_vars(TokenStream &in, bool &valid)
{
CHECK(expect_l_list(in, valid), valid);
- while(!accept_r_list(in))
+ while (!accept_r_list(in))
{
CHECK(text_val(in, valid), valid);
}
@@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid)
void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid)
{
CHECK(expect_text(in, "<heuristics-table>", valid), valid);
- while(!accept_text(in, "</heuristics-table>"))
+ while (!accept_text(in, "</heuristics-table>"))
{
CHECK(heuristics_table_entry(in, h, valid), valid);
}
@@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid)
const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val);
const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val);
const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val);
- if(l_t != ComparatorType::Var || r_t != ComparatorType::Num)
+ if (l_t != ComparatorType::Var || r_t != ComparatorType::Num)
{
- FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
+ FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos,
+ "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
}
- return Condition{ l_v, c_o, r_v };
+ return Condition{l_v, c_o, r_v};
}
void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
@@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
CHECK(expect_text(in, "<heuristic", valid), valid);
const auto tree_id = CHECK(uint_val(in, valid), valid);
CHECK(expect_text(in, ">", valid), valid);
- HeuristicTree *t = nullptr;
- std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid);
+ HeuristicTree *t = nullptr;
+ std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid);
const HeuristicType t_heuristic_type = std::get<0>(t->index());
- while(!accept_text(in, "</heuristic>"))
+ while (!accept_text(in, "</heuristic>"))
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "b"))
+ if (accept_text(in, "b"))
{
// Branch node
const auto id = CHECK(uint_val(in, valid), valid);
@@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
const auto f_id = CHECK(uint_val(in, valid), valid);
valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid);
}
- else if(accept_text(in, "l"))
+ else if (accept_text(in, "l"))
{
// Leaf node
const auto id = CHECK(uint_val(in, valid), valid);
@@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
// heuristic table). For now it remains as a step for validation.
LOG_TOKEN_POS(in, pos);
CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid);
- switch(t_heuristic_type)
+ switch (t_heuristic_type)
{
case HeuristicType::GEMM_Type:
{
@@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid)
MLGOHeuristics h;
CHECK_DEFAULT(header(in, valid), valid, h);
CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h);
- while(accept_text(in, "<heuristic", false))
+ while (accept_text(in, "<heuristic", false))
{
CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h);
}
@@ -809,4 +803,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in)
#undef CHECK
#undef CHECK_DEFAULT
#undef FAIL_WITH_MSG
-#undef FAIL_WITH_MSG_DEFAULT \ No newline at end of file
+#undef FAIL_WITH_MSG_DEFAULT
diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h
index 49d8b9c644..cffce8d6a1 100644
--- a/src/runtime/CL/mlgo/MLGOParser.h
+++ b/src/runtime/CL/mlgo/MLGOParser.h
@@ -98,15 +98,14 @@ struct CharPosition
return ln == other.ln && col == other.col;
}
- size_t ln{ 0 };
- size_t col{ 0 };
+ size_t ln{0};
+ size_t col{0};
};
/** Token */
struct Token
{
- Token(TokenType t, std::string v, CharPosition pos)
- : type{ t }, value{ v }, pos{ pos }
+ Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos}
{
}
@@ -196,4 +195,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in);
} // namespace parser
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H
diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp
index 81d418c28e..c7e0100b3c 100644
--- a/src/runtime/CL/mlgo/Utils.cpp
+++ b/src/runtime/CL/mlgo/Utils.cpp
@@ -43,40 +43,38 @@ inline std::string to_str(const T &val)
std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config)
{
return os << "Native:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "}";
}
std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config)
{
return os << "ReshapedOnlyRHS:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "h0: " << config.h0 << ", "
- << "interleave_rhs: " << config.interleave_rhs << ", "
- << "transpose_rhs: " << config.transpose_rhs << ", "
- << "export_cl_image: " << config.export_cl_image
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "h0: " << config.h0 << ", "
+ << "interleave_rhs: " << config.interleave_rhs << ", "
+ << "transpose_rhs: " << config.transpose_rhs << ", "
+ << "export_cl_image: " << config.export_cl_image << "}";
}
std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config)
{
return os << "Reshaped:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "v0: " << config.v0 << ", "
- << "h0: " << config.h0 << ", "
- << "interleave_lhs: " << config.interleave_lhs << ", "
- << "interleave_rhs: " << config.interleave_rhs << ", "
- << "transpose_rhs: " << config.transpose_rhs << ", "
- << "export_cl_image: " << config.export_cl_image
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "v0: " << config.v0 << ", "
+ << "h0: " << config.h0 << ", "
+ << "interleave_lhs: " << config.interleave_lhs << ", "
+ << "interleave_rhs: " << config.interleave_rhs << ", "
+ << "transpose_rhs: " << config.transpose_rhs << ", "
+ << "export_cl_image: " << config.export_cl_image << "}";
}
std::ostream &operator<<(std::ostream &os, HeuristicType ht)
{
- switch(ht)
+ switch (ht)
{
case HeuristicType::GEMM_Type:
{
@@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht)
}
std::ostream &operator<<(std::ostream &os, DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::F32:
{
@@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos)
} // namespace mlgo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h
index c634a887e9..73b537f476 100644
--- a/src/runtime/CL/mlgo/Utils.h
+++ b/src/runtime/CL/mlgo/Utils.h
@@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht);
std::ostream &operator<<(std::ostream &os, DataType dt);
std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index);
std::ostream &operator<<(std::ostream &os, const Query &query);
-std::string to_string(const GEMMConfigNative &config);
-std::string to_string(const GEMMConfigReshapedOnlyRHS &config);
-std::string to_string(const GEMMConfigReshaped &config);
-std::string to_string(const Query &query);
+std::string to_string(const GEMMConfigNative &config);
+std::string to_string(const GEMMConfigReshapedOnlyRHS &config);
+std::string to_string(const GEMMConfigReshaped &config);
+std::string to_string(const Query &query);
namespace parser
{
std::ostream &operator<<(std::ostream &os, const CharPosition &pos);
@@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos);
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_UTILS_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_UTILS_H
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 6cb2212794..5e3907f1ea 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,20 +27,20 @@ namespace arm_compute
{
namespace cl_tuner
{
-constexpr unsigned int max_lws_supported_x{ 64u };
-constexpr unsigned int max_lws_supported_y{ 32u };
-constexpr unsigned int max_lws_supported_z{ 32u };
+constexpr unsigned int max_lws_supported_x{64u};
+constexpr unsigned int max_lws_supported_y{32u};
+constexpr unsigned int max_lws_supported_z{32u};
-/** Non instantiable base class for Tuning parameters combinations that use Index2Cooard mapping */
+/** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */
class CLTuningParametersList : public ICLTuningParametersList
{
protected:
/* Shape of 4-D search space */
- TensorShape search_space_shape{ 0, 0, 0, 0 };
- std::vector<unsigned int> _lws_x{ 0 };
- std::vector<unsigned int> _lws_y{ 0 };
- std::vector<unsigned int> _lws_z{ 0 };
- std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units.
+ TensorShape search_space_shape{0, 0, 0, 0};
+ std::vector<unsigned int> _lws_x{0};
+ std::vector<unsigned int> _lws_y{0};
+ std::vector<unsigned int> _lws_z{0};
+ std::vector<int> _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units.
The value is in the range [-31,+31].
When 0, the runtime-selected wbs used is unmodified. */
@@ -116,7 +116,8 @@ private:
* @param[in] lws_max Max LWS value allowed to be tested
* @param[in] mod_let_one True if the result of the modulo operation between gws and the lws can be less than or equal to one.
*/
- void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
+ void
+ initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
};
/** A minimal subset of LWS values that only have 1,2 and 4/8 */
@@ -162,14 +163,17 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index)
CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
- ARM_COMPUTE_UNUSED(gws);
- search_space_shape[0] = max_lws_supported_x;
- search_space_shape[1] = max_lws_supported_y;
- search_space_shape[2] = max_lws_supported_z;
+ const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+ const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+ const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+ search_space_shape[0] = lws_x_max;
+ search_space_shape[1] = lws_y_max;
+ search_space_shape[2] = lws_z_max;
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -3, -2, -1, 0, 1, 2, 3 };
+ _wbsm = {-3, -2, -1, 0, 1, 2, 3};
search_space_shape[3] = _wbsm.size();
}
}
@@ -183,34 +187,39 @@ CLTuningParams CLTuningParametersListNormal::operator[](size_t index)
CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
- auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
- auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
- auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+ const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+ const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+ const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
// Initialize the tuning parameters values to test
_lws_x = {};
_lws_y = {};
_lws_z = {};
- initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
- initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_x, gws[0], lws_x_max,
+ gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_y, gws[1], lws_y_max,
+ gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
search_space_shape[0] = _lws_x.size();
search_space_shape[1] = _lws_y.size();
search_space_shape[2] = _lws_z.size();
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -2, -1, 0, 1, 2 };
+ _wbsm = {-2, -1, 0, 1, 2};
search_space_shape[3] = _wbsm.size();
}
}
-void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws,
+ unsigned int gws,
+ unsigned int lws_max,
+ bool mod_let_one)
{
lws.push_back(1);
- for(unsigned int i = 2; i <= lws_max; ++i)
+ for (unsigned int i = 2; i <= lws_max; ++i)
{
// Power of two condition
const bool is_power_of_two = (i & (i - 1)) == 0;
@@ -218,7 +227,7 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in
// Condition on the modulo, chosen according to the mod_let_one flag
const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
- if(mod_cond || is_power_of_two)
+ if (mod_cond || is_power_of_two)
{
lws.push_back(i);
}
@@ -227,9 +236,9 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in
CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
- auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
- auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
- auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+ const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+ const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+ const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
// Initialize the LWS values to test
_lws_x = {};
@@ -243,9 +252,9 @@ CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws,
search_space_shape[1] = _lws_y.size();
search_space_shape[2] = _lws_z.size();
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -1, 0, 1 };
+ _wbsm = {-1, 0, 1};
search_space_shape[3] = _wbsm.size();
}
}
@@ -254,7 +263,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
{
lws.push_back(1);
- for(unsigned int i = 2; i <= lws_max; i *= 4)
+ for (unsigned int i = 2; i <= lws_max; i *= 4)
{
lws.push_back(i);
}
@@ -262,7 +271,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
{
- switch(tuning_info.tuner_mode)
+ switch (tuning_info.tuner_mode)
{
case CLTunerMode::EXHAUSTIVE:
return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index f112d456c7..9fbdc3a4dd 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/Utility.h"
+
#include "support/Mutex.h"
#include <atomic>
@@ -53,8 +54,7 @@ public:
* @param[in] start First value that will be returned by the feeder
* @param[in] end End condition (The last value returned by get_next() will be end - 1)
*/
- explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0)
- : _atomic_counter(start), _end(end)
+ explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end)
{
}
/** Return the next element in the range if there is one.
@@ -89,8 +89,7 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede
{
ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size());
workloads[workload_index](info);
- }
- while(feeder.get_next(workload_index));
+ } while (feeder.get_next(workload_index));
}
/** Set thread affinity. Pin current thread to a particular core
@@ -99,17 +98,17 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede
*/
void set_thread_affinity(int core_id)
{
- if(core_id < 0)
+ if (core_id < 0)
{
return;
}
-#if !defined(__APPLE__)
+#if !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__)
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(core_id, &set);
ARM_COMPUTE_EXIT_ON_MSG(sched_setaffinity(0, sizeof(set), &set), "Error setting thread affinity");
-#endif /* !defined(__APPLE__) */
+#endif /* !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__) */
}
/** There are currently 2 scheduling modes supported by CPPScheduler
@@ -150,10 +149,10 @@ public:
*/
explicit Thread(int core_pin = -1);
- Thread(const Thread &) = delete;
+ Thread(const Thread &) = delete;
Thread &operator=(const Thread &) = delete;
Thread(Thread &&) = delete;
- Thread &operator=(Thread &&) = delete;
+ Thread &operator=(Thread &&) = delete;
/** Destructor. Make the thread join. */
~Thread();
@@ -172,7 +171,7 @@ public:
void start();
/** Wait for the current kernel execution to complete. */
- void wait();
+ std::exception_ptr wait();
/** Function ran by the worker thread. */
void worker_thread();
@@ -196,21 +195,20 @@ public:
private:
std::thread _thread{};
ThreadInfo _info{};
- std::vector<IScheduler::Workload> *_workloads{ nullptr };
- ThreadFeeder *_feeder{ nullptr };
+ std::vector<IScheduler::Workload> *_workloads{nullptr};
+ ThreadFeeder *_feeder{nullptr};
std::mutex _m{};
std::condition_variable _cv{};
- bool _wait_for_work{ false };
- bool _job_complete{ true };
- std::exception_ptr _current_exception{ nullptr };
- int _core_pin{ -1 };
- std::list<Thread> *_thread_pool{ nullptr };
- unsigned int _wake_beg{ 0 };
- unsigned int _wake_end{ 0 };
+ bool _wait_for_work{false};
+ bool _job_complete{true};
+ std::exception_ptr _current_exception{nullptr};
+ int _core_pin{-1};
+ std::list<Thread> *_thread_pool{nullptr};
+ unsigned int _wake_beg{0};
+ unsigned int _wake_end{0};
};
-Thread::Thread(int core_pin)
- : _core_pin(core_pin)
+Thread::Thread(int core_pin) : _core_pin(core_pin)
{
_thread = std::thread(&Thread::worker_thread, this);
}
@@ -218,7 +216,7 @@ Thread::Thread(int core_pin)
Thread::~Thread()
{
// Make sure worker thread has ended
- if(_thread.joinable())
+ if (_thread.joinable())
{
ThreadFeeder feeder;
set_workload(nullptr, feeder, ThreadInfo());
@@ -244,24 +242,20 @@ void Thread::start()
_cv.notify_one();
}
-void Thread::wait()
+std::exception_ptr Thread::wait()
{
{
std::unique_lock<std::mutex> lock(_m);
_cv.wait(lock, [&] { return _job_complete; });
}
-
- if(_current_exception)
- {
- std::rethrow_exception(_current_exception);
- }
+ return _current_exception;
}
void Thread::worker_thread()
{
set_thread_affinity(_core_pin);
- while(true)
+ while (true)
{
std::unique_lock<std::mutex> lock(_m);
_cv.wait(lock, [&] { return _wait_for_work; });
@@ -270,18 +264,18 @@ void Thread::worker_thread()
_current_exception = nullptr;
// Exit if the worker thread has not been fed with workloads
- if(_workloads == nullptr || _feeder == nullptr)
+ if (_workloads == nullptr || _feeder == nullptr)
{
return;
}
// Wake up more peer threads from thread pool if this job has been delegated to the current thread
- if(_thread_pool != nullptr)
+ if (_thread_pool != nullptr)
{
auto thread_it = _thread_pool->begin();
std::advance(thread_it, std::min(static_cast<unsigned int>(_thread_pool->size()), _wake_beg));
auto wake_end = std::min(_wake_end, static_cast<unsigned int>(_info.num_threads - 1));
- for(unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it)
+ for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it)
{
thread_it->start();
}
@@ -295,7 +289,7 @@ void Thread::worker_thread()
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(...)
+ catch (...)
{
_current_exception = std::current_exception();
}
@@ -326,11 +320,11 @@ struct CPPScheduler::Impl final
: _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U)
{
const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE"));
- if(mode_env_v == "linear")
+ if (mode_env_v == "linear")
{
_forced_mode = ModeToggle::Linear;
}
- else if(mode_env_v == "fanout")
+ else if (mode_env_v == "fanout")
{
_forced_mode = ModeToggle::Fanout;
}
@@ -354,7 +348,7 @@ struct CPPScheduler::Impl final
// Set affinity on worked threads
_threads.clear();
- for(auto i = 1U; i < _num_threads; ++i)
+ for (auto i = 1U; i < _num_threads; ++i)
{
_threads.emplace_back(func(i, thread_hint));
}
@@ -363,20 +357,23 @@ struct CPPScheduler::Impl final
void auto_switch_mode(unsigned int num_threads_to_use)
{
// If the environment variable is set to any of the modes, it overwrites the mode selected over num_threads_to_use
- if(_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
+ if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
{
set_fanout_mode(m_default_wake_fanout, num_threads_to_use);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", this->wake_fanout(), num_threads_to_use);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n",
+ this->wake_fanout(), num_threads_to_use);
}
else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8))
{
set_linear_mode();
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", num_threads_to_use);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n",
+ num_threads_to_use);
}
}
void set_linear_mode()
{
- for(auto &thread : _threads)
+ for (auto &thread : _threads)
{
thread.set_linear_mode();
}
@@ -388,14 +385,14 @@ struct CPPScheduler::Impl final
ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1);
const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1));
auto thread_it = _threads.begin();
- for(auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
+ for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
{
const auto wake_begin = i * actual_wake_fanout - 1;
const auto wake_end = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1);
thread_it->set_fanout_mode(&_threads, wake_begin, wake_end);
}
// Reset the remaining threads's wake up schedule
- while(thread_it != _threads.end())
+ while (thread_it != _threads.end())
{
thread_it->set_fanout_mode(&_threads, 0U, 0U);
++thread_it;
@@ -421,9 +418,9 @@ struct CPPScheduler::Impl final
unsigned int _num_threads;
std::list<Thread> _threads;
arm_compute::Mutex _run_workloads_mutex{};
- Mode _mode{ Mode::Linear };
- ModeToggle _forced_mode{ ModeToggle::None };
- unsigned int _wake_fanout{ 0 };
+ Mode _mode{Mode::Linear};
+ ModeToggle _forced_mode{ModeToggle::None};
+ unsigned int _wake_fanout{0};
};
/*
@@ -435,8 +432,7 @@ CPPScheduler &CPPScheduler::get()
return scheduler;
}
-CPPScheduler::CPPScheduler()
- : _impl(std::make_unique<Impl>(num_threads_hint()))
+CPPScheduler::CPPScheduler() : _impl(std::make_unique<Impl>(num_threads_hint()))
{
}
@@ -469,15 +465,15 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
// This is not great because different threads workloads won't run in parallel but at least they
// won't interfere each other and deadlock.
arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex);
- const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size()));
- if(num_threads_to_use < 1)
+ const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size()));
+ if (num_threads_to_use < 1)
{
return;
}
// Re-adjust the mode if the actual number of threads to use is different from the number of threads created
_impl->auto_switch_mode(num_threads_to_use);
int num_threads_to_start = 0;
- switch(_impl->mode())
+ switch (_impl->mode())
{
case CPPScheduler::Impl::Mode::Fanout:
{
@@ -493,35 +489,54 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
}
ThreadFeeder feeder(num_threads_to_use, workloads.size());
ThreadInfo info;
- info.cpu_info = &_cpu_info;
+ info.cpu_info = &cpu_info();
info.num_threads = num_threads_to_use;
unsigned int t = 0;
auto thread_it = _impl->_threads.begin();
// Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread
- for(; t < num_threads_to_use - 1; ++t, ++thread_it)
+ for (; t < num_threads_to_use - 1; ++t, ++thread_it)
{
info.thread_id = t;
thread_it->set_workload(&workloads, feeder, info);
}
thread_it = _impl->_threads.begin();
- for(int i = 0; i < num_threads_to_start; ++i, ++thread_it)
+ for (int i = 0; i < num_threads_to_start; ++i, ++thread_it)
{
thread_it->start();
}
- info.thread_id = t; // Set main thread's thread_id
- process_workloads(workloads, feeder, info); // Main thread processes workloads
+ info.thread_id = t; // Set main thread's thread_id
+ std::exception_ptr last_exception = nullptr;
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ try
+ {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+ process_workloads(workloads, feeder, info); // Main thread processes workloads
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ }
+ catch (...)
+ {
+ last_exception = std::current_exception();
+ }
+
try
{
#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
thread_it = _impl->_threads.begin();
- for(unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it)
+ for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it)
+ {
+ std::exception_ptr current_exception = thread_it->wait();
+ if (current_exception)
+ {
+ last_exception = current_exception;
+ }
+ }
+ if (last_exception)
{
- thread_it->wait();
+ std::rethrow_exception(last_exception);
}
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::system_error &e)
+ catch (const std::system_error &e)
{
std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 70536b7ccc..c46a2731d8 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -39,33 +39,36 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
{
const Window &max_window = kernel->window();
- if(hints.split_dimension() != IScheduler::split_dimensions_all)
+ if (hints.split_dimension() != IScheduler::split_dimensions_all)
{
const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
- if(num_iterations < 1)
+ if (num_iterations < 1)
{
return;
}
}
ThreadInfo info;
- info.cpu_info = &_cpu_info;
+ info.cpu_info = &cpu_info();
kernel->run(kernel->window(), info);
}
-void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors)
+void SingleThreadScheduler::schedule_op(ICPPKernel *kernel,
+ const Hints &hints,
+ const Window &window,
+ ITensorPack &tensors)
{
ARM_COMPUTE_UNUSED(hints);
ThreadInfo info;
- info.cpu_info = &_cpu_info;
+ info.cpu_info = &cpu_info();
kernel->run_op(tensors, window, info);
}
void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads)
{
ThreadInfo info;
- info.cpu_info = &_cpu_info;
- for(auto &wl : workloads)
+ info.cpu_info = &cpu_info();
+ for (auto &wl : workloads)
{
wl(info);
}
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index b6803d0d37..94a1673d59 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
#include "arm_compute/runtime/Scheduler.h"
+#include "src/common/utils/Log.h"
+
namespace arm_compute
{
namespace
@@ -40,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output)
Iterator input_it(input, window);
Iterator output_it(output, window);
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<float *>(output_it.ptr()) =
+ dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM8_SIGNED:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<float *>(output_it.ptr()) =
+ dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM16:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<float *>(output_it.ptr()) =
+ dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+ },
+ input_it, output_it);
break;
default:
ARM_COMPUTE_ERROR("Unsupported data type");
@@ -78,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output)
Iterator input_it(input, window);
Iterator output_it(output, window);
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<uint8_t *>(output_it.ptr()) =
+ quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM8_SIGNED:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<int8_t *>(output_it.ptr()) =
+ quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM16:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<uint16_t *>(output_it.ptr()) =
+ quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
default:
ARM_COMPUTE_ERROR("Unsupported data type");
@@ -130,12 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh
{
}
-void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out,
+ ITensor *keeps,
+ ITensor *keeps_size,
+ const BoxNMSLimitInfo info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
+ ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out,
+ keeps, keeps_size, info);
- _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
+ _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 ||
+ scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
_scores_in = scores_in;
_boxes_in = boxes_in;
@@ -146,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co
_batch_splits_out = batch_splits_out;
_keeps = keeps;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
// Manage intermediate buffers
_memory_group.manage(&_scores_in_f32);
@@ -156,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co
_memory_group.manage(&_classes_f32);
_scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32));
_boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32));
- if(batch_splits_in != nullptr)
+ if (batch_splits_in != nullptr)
{
_memory_group.manage(&_batch_splits_in_f32);
_batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32));
@@ -164,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co
_scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32));
_boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32));
_classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32));
- if(batch_splits_out != nullptr)
+ if (batch_splits_out != nullptr)
{
_memory_group.manage(&_batch_splits_out_f32);
_batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32));
}
- if(keeps != nullptr)
+ if (keeps != nullptr)
{
_memory_group.manage(&_keeps_f32);
_keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32));
}
- _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
+ _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32,
+ (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
&_scores_out_f32, &_boxes_out_f32, &_classes_f32,
- (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr,
- keeps_size, info);
+ (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr,
+ (keeps != nullptr) ? &_keeps_f32 : nullptr, keeps_size, info);
}
else
{
- _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+ _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes,
+ batch_splits_out, keeps, keeps_size, info);
}
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_scores_in_f32.allocator()->allocate();
_boxes_in_f32.allocator()->allocate();
- if(_batch_splits_in != nullptr)
+ if (_batch_splits_in != nullptr)
{
_batch_splits_in_f32.allocator()->allocate();
}
_scores_out_f32.allocator()->allocate();
_boxes_out_f32.allocator()->allocate();
_classes_f32.allocator()->allocate();
- if(batch_splits_out != nullptr)
+ if (batch_splits_out != nullptr)
{
_batch_splits_out_f32.allocator()->allocate();
}
- if(keeps != nullptr)
+ if (keeps != nullptr)
{
_keeps_f32.allocator()->allocate();
}
}
}
-Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes,
- const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
+Status validate(const ITensorInfo *scores_in,
+ const ITensorInfo *boxes_in,
+ const ITensorInfo *batch_splits_in,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *boxes_out,
+ const ITensorInfo *classes,
+ const ITensorInfo *batch_splits_out,
+ const ITensorInfo *keeps,
+ const ITensorInfo *keeps_size,
+ const BoxNMSLimitInfo info)
{
ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
- const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
- if(is_qasymm8)
+ const bool is_qasymm8 =
+ scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out);
@@ -233,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run()
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
dequantize_tensor(_scores_in, &_scores_in_f32);
dequantize_tensor(_boxes_in, &_boxes_in_f32);
- if(_batch_splits_in != nullptr)
+ if (_batch_splits_in != nullptr)
{
dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32);
}
@@ -245,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run()
Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY);
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
quantize_tensor(&_scores_out_f32, _scores_out);
quantize_tensor(&_boxes_out_f32, _boxes_out);
quantize_tensor(&_classes_f32, _classes);
- if(_batch_splits_out != nullptr)
+ if (_batch_splits_out != nullptr)
{
quantize_tensor(&_batch_splits_out_f32, _batch_splits_out);
}
- if(_keeps != nullptr)
+ if (_keeps != nullptr)
{
quantize_tensor(&_keeps_f32, _keeps);
}
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index fdb4c9f0f6..e6291f973e 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include <list>
@@ -34,25 +36,35 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status validate_arguments(const ITensorInfo *input_loc,
+ const ITensorInfo *input_conf,
+ const ITensorInfo *input_priorbox,
+ const ITensorInfo *output,
+ DetectionOutputLayerInfo info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The location input tensor should be [C2, N].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3,
+ "The priorbox input tensor should be [C3, 2, N].");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f && info.eta() > 1.f, "Eta should be between 0 and 1");
const int num_priors = input_priorbox->tensor_shape()[0] / 4;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) !=
+ input_loc->tensor_shape()[0],
+ "Number of priors must match number of location predictions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) !=
+ input_conf->tensor_shape()[0],
+ "Number of priors must match number of confidence predictions.");
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+ const unsigned int max_size =
+ info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
}
@@ -63,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input
/** Function used to sort pair<float, T> in descend order based on the score (first) value.
*/
template <typename T>
-bool SortScorePairDescend(const std::pair<float, T> &pair1,
- const std::pair<float, T> &pair2)
+bool SortScorePairDescend(const std::pair<float, T> &pair1, const std::pair<float, T> &pair2)
{
return pair1.first > pair2.first;
}
@@ -80,16 +91,19 @@ bool SortScorePairDescend(const std::pair<float, T> &pair1,
* @param[out] all_location_predictions All the location predictions.
*
*/
-void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
- const int num_priors, const int num_loc_classes,
- const bool share_location, std::vector<LabelBBox> &all_location_predictions)
+void retrieve_all_loc_predictions(const ITensor *input_loc,
+ const int num,
+ const int num_priors,
+ const int num_loc_classes,
+ const bool share_location,
+ std::vector<LabelBBox> &all_location_predictions)
{
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int c = 0; c < num_loc_classes; ++c)
+ for (int c = 0; c < num_loc_classes; ++c)
{
int label = share_location ? -1 : c;
- if(all_location_predictions[i].find(label) == all_location_predictions[i].end())
+ if (all_location_predictions[i].find(label) == all_location_predictions[i].end())
{
all_location_predictions[i][label].resize(num_priors);
}
@@ -100,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
}
}
}
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int p = 0; p < num_priors; ++p)
+ for (int p = 0; p < num_priors; ++p)
{
- for(int c = 0; c < num_loc_classes; ++c)
+ for (int c = 0; c < num_loc_classes; ++c)
{
const int label = share_location ? -1 : c;
const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4;
//xmin, ymin, xmax, ymax
- all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
- all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
- all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
- all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
+ all_location_predictions[i][label][p][0] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
+ all_location_predictions[i][label][p][1] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
+ all_location_predictions[i][label][p][2] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
+ all_location_predictions[i][label][p][3] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
}
}
}
@@ -128,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
* @param[out] all_location_predictions All the location predictions.
*
*/
-void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
- const int num_priors, const int num_classes,
+void retrieve_all_conf_scores(const ITensor *input_conf,
+ const int num,
+ const int num_priors,
+ const int num_classes,
std::vector<std::map<int, std::vector<float>>> &all_confidence_scores)
{
std::vector<float> tmp_buffer;
tmp_buffer.resize(num * num_priors * num_classes);
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int c = 0; c < num_classes; ++c)
+ for (int c = 0; c < num_classes; ++c)
{
- for(int p = 0; p < num_priors; ++p)
+ for (int p = 0; p < num_priors; ++p)
{
- tmp_buffer[i * num_classes * num_priors + c * num_priors + p] =
- *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
+ tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast<float *>(
+ input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
}
}
}
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int c = 0; c < num_classes; ++c)
+ for (int c = 0; c < num_classes; ++c)
{
all_confidence_scores[i][c].resize(num_priors);
all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors],
@@ -166,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
* @param[out] all_location_predictions All the location predictions.
*
*/
-void retrieve_all_priorbox(const ITensor *input_priorbox,
- const int num_priors,
- std::vector<BBox> &all_prior_bboxes,
+void retrieve_all_priorbox(const ITensor *input_priorbox,
+ const int num_priors,
+ std::vector<BBox> &all_prior_bboxes,
std::vector<std::array<float, 4>> &all_prior_variances)
{
- for(int i = 0; i < num_priors; ++i)
+ for (int i = 0; i < num_priors; ++i)
{
- all_prior_bboxes[i] =
- {
- {
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))
- }
- };
+ all_prior_bboxes[i] = {{*reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}};
}
- std::array<float, 4> var({ { 0, 0, 0, 0 } });
- for(int i = 0; i < num_priors; ++i)
+ std::array<float, 4> var({{0, 0, 0, 0}});
+ for (int i = 0; i < num_priors; ++i)
{
- for(int j = 0; j < 4; ++j)
+ for (int j = 0; j < 4; ++j)
{
var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j)));
}
@@ -206,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox,
* @param[out] decode_bbox The decoded bboxes.
*
*/
-void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_variance,
- const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target,
- const bool clip_bbox, const BBox &bbox, BBox &decode_bbox)
+void DecodeBBox(const BBox &prior_bbox,
+ const std::array<float, 4> &prior_variance,
+ const DetectionOutputLayerCodeType code_type,
+ const bool variance_encoded_in_target,
+ const bool clip_bbox,
+ const BBox &bbox,
+ BBox &decode_bbox)
{
// if the variance is encoded in target, we simply need to add the offset predictions
// otherwise we need to scale the offset accordingly.
- switch(code_type)
+ switch (code_type)
{
case DetectionOutputLayerCodeType::CORNER:
{
@@ -235,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
- const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
- const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
- const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
- const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
+ const float decode_bbox_center_x =
+ (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
+ const float decode_bbox_center_y =
+ (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
+ const float decode_bbox_width =
+ (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
+ const float decode_bbox_height =
+ (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f);
decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f);
@@ -256,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
- decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
- decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
- decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
- decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
+ decode_bbox[0] =
+ prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
+ decode_bbox[1] =
+ prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
+ decode_bbox[2] =
+ prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
+ decode_bbox[3] =
+ prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
break;
}
@@ -267,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type.");
}
- if(clip_bbox)
+ if (clip_bbox)
{
- for(auto &d_bbox : decode_bbox)
+ for (auto &d_bbox : decode_bbox)
{
d_bbox = utility::clamp(d_bbox, 0.f, 1.f);
}
@@ -287,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
* @param[out] indices The kept indices of bboxes after nms.
*
*/
-void ApplyNMSFast(const std::vector<BBox> &bboxes,
- const std::vector<float> &scores, const float score_threshold,
- const float nms_threshold, const float eta, const int top_k,
- std::vector<int> &indices)
+void ApplyNMSFast(const std::vector<BBox> &bboxes,
+ const std::vector<float> &scores,
+ const float score_threshold,
+ const float nms_threshold,
+ const float eta,
+ const int top_k,
+ std::vector<int> &indices)
{
ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size.");
@@ -298,9 +328,9 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
std::list<std::pair<float, int>> score_index_vec;
// Generate index score pairs.
- for(size_t i = 0; i < scores.size(); ++i)
+ for (size_t i = 0; i < scores.size(); ++i)
{
- if(scores[i] > score_threshold)
+ if (scores[i] > score_threshold)
{
score_index_vec.emplace_back(std::make_pair(scores[i], i));
}
@@ -311,7 +341,7 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
// Keep top_k scores if needed.
const int score_index_vec_size = score_index_vec.size();
- if(top_k > -1 && top_k < score_index_vec_size)
+ if (top_k > -1 && top_k < score_index_vec_size)
{
score_index_vec.resize(top_k);
}
@@ -320,46 +350,45 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
float adaptive_threshold = nms_threshold;
indices.clear();
- while(!score_index_vec.empty())
+ while (!score_index_vec.empty())
{
const int idx = score_index_vec.front().second;
bool keep = true;
- for(int kept_idx : indices)
+ for (int kept_idx : indices)
{
- if(keep)
+ if (keep)
{
// Compute the jaccard (intersection over union IoU) overlap between two bboxes.
- BBox intersect_bbox = std::array<float, 4>({ 0, 0, 0, 0 });
- if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
+ BBox intersect_bbox = std::array<float, 4>({0, 0, 0, 0});
+ if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] ||
+ bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
{
- intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+ intersect_bbox = std::array<float, 4>({{0, 0, 0, 0}});
}
else
{
- intersect_bbox = std::array<float, 4>({ {
- std::max(bboxes[idx][0], bboxes[kept_idx][0]),
- std::max(bboxes[idx][1], bboxes[kept_idx][1]),
- std::min(bboxes[idx][2], bboxes[kept_idx][2]),
- std::min(bboxes[idx][3], bboxes[kept_idx][3])
- }
- });
+ intersect_bbox = std::array<float, 4>(
+ {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]),
+ std::min(bboxes[idx][2], bboxes[kept_idx][2]),
+ std::min(bboxes[idx][3], bboxes[kept_idx][3])}});
}
float intersect_width = intersect_bbox[2] - intersect_bbox[0];
float intersect_height = intersect_bbox[3] - intersect_bbox[1];
float overlap = 0.f;
- if(intersect_width > 0 && intersect_height > 0)
+ if (intersect_width > 0 && intersect_height > 0)
{
float intersect_size = intersect_width * intersect_height;
- float bbox1_size = (bboxes[idx][2] < bboxes[idx][0]
- || bboxes[idx][3] < bboxes[idx][1]) ?
- 0.f :
- (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
- float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0]
- || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ?
- 0.f :
- (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
+ float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1])
+ ? 0.f
+ : (bboxes[idx][2] - bboxes[idx][0]) *
+ (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
+ float bbox2_size =
+ (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1])
+ ? 0.f
+ : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) *
+ (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size);
}
keep = (overlap <= adaptive_threshold);
@@ -369,12 +398,12 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
break;
}
}
- if(keep)
+ if (keep)
{
indices.push_back(idx);
}
score_index_vec.erase(score_index_vec.begin());
- if(keep && eta < 1.f && adaptive_threshold > 0.5f)
+ if (keep && eta < 1.f && adaptive_threshold > 0.5f)
{
adaptive_threshold *= eta;
}
@@ -383,23 +412,42 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
} // namespace
CPPDetectionOutputLayer::CPPDetectionOutputLayer()
- : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
- _all_prior_variances(), _all_decode_bboxes(), _all_indices()
+ : _input_loc(nullptr),
+ _input_conf(nullptr),
+ _input_priorbox(nullptr),
+ _output(nullptr),
+ _info(),
+ _num_priors(),
+ _num(),
+ _all_location_predictions(),
+ _all_confidence_scores(),
+ _all_prior_bboxes(),
+ _all_prior_variances(),
+ _all_decode_bboxes(),
+ _all_indices()
{
}
-void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info)
+void CPPDetectionOutputLayer::configure(const ITensor *input_loc,
+ const ITensor *input_conf,
+ const ITensor *input_priorbox,
+ ITensor *output,
+ DetectionOutputLayerInfo info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+ ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info);
+
// Output auto initialization if not yet initialized
// Since the number of bboxes to kept is unknown before nms, the shape is set to the maximum
// The maximum is keep_top_k * input_loc_size[1]
// Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax]
- const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
+ const unsigned int max_size =
+ info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
_input_loc = input_loc;
_input_conf = input_conf;
@@ -415,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor
_all_prior_variances.resize(_num_priors);
_all_decode_bboxes.resize(_num);
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- for(int c = 0; c < _info.num_loc_classes(); ++c)
+ for (int c = 0; c < _info.num_loc_classes(); ++c)
{
const int label = _info.share_location() ? -1 : c;
- if(label == _info.background_label_id())
+ if (label == _info.background_label_id())
{
// Ignore background class.
continue;
@@ -435,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor
output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
}
-Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc,
+ const ITensorInfo *input_conf,
+ const ITensorInfo *input_priorbox,
+ const ITensorInfo *output,
+ DetectionOutputLayerInfo info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info));
return Status{};
@@ -444,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe
void CPPDetectionOutputLayer::run()
{
// Retrieve all location predictions.
- retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions);
+ retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(),
+ _all_location_predictions);
// Retrieve all confidences.
retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores);
@@ -454,75 +507,79 @@ void CPPDetectionOutputLayer::run()
// Decode all loc predictions to bboxes
const bool clip_bbox = false;
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- for(int c = 0; c < _info.num_loc_classes(); ++c)
+ for (int c = 0; c < _info.num_loc_classes(); ++c)
{
const int label = _info.share_location() ? -1 : c;
- if(label == _info.background_label_id())
+ if (label == _info.background_label_id())
{
// Ignore background class.
continue;
}
- ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+ ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(),
+ "Could not find location predictions for label %d.", label);
const std::vector<BBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
const int num_bboxes = _all_prior_bboxes.size();
ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4);
- for(int j = 0; j < num_bboxes; ++j)
+ for (int j = 0; j < num_bboxes; ++j)
{
- DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]);
+ DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(),
+ _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j],
+ _all_decode_bboxes[i][label][j]);
}
}
}
int num_kept = 0;
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
- const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
std::map<int, std::vector<int>> indices;
- int num_det = 0;
- for(int c = 0; c < _info.num_classes(); ++c)
+ int num_det = 0;
+ for (int c = 0; c < _info.num_classes(); ++c)
{
- if(c == _info.background_label_id())
+ if (c == _info.background_label_id())
{
// Ignore background class
continue;
}
const int label = _info.share_location() ? -1 : c;
- if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
+ if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
{
ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
}
const std::vector<float> &scores = conf_scores.find(c)->second;
- const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second;
+ const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second;
- ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]);
+ ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(),
+ _info.top_k(), indices[c]);
num_det += indices[c].size();
}
int num_to_add = 0;
- if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
+ if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
{
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
- for(auto const &it : indices)
+ for (auto const &it : indices)
{
const int label = it.first;
const std::vector<int> &label_indices = it.second;
- if(conf_scores.find(label) == conf_scores.end())
+ if (conf_scores.find(label) == conf_scores.end())
{
ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
}
const std::vector<float> &scores = conf_scores.find(label)->second;
- for(auto idx : label_indices)
+ for (auto idx : label_indices)
{
ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
@@ -536,7 +593,7 @@ void CPPDetectionOutputLayer::run()
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
- for(auto score_index_pair : score_index_pairs)
+ for (auto score_index_pair : score_index_pairs)
{
int label = score_index_pair.second.first;
int idx = score_index_pair.second.second;
@@ -557,25 +614,25 @@ void CPPDetectionOutputLayer::run()
_output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept)));
int count = 0;
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
- const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
- for(auto &it : _all_indices[i])
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ for (auto &it : _all_indices[i])
{
const int label = it.first;
const std::vector<float> &scores = conf_scores.find(label)->second;
const int loc_label = _info.share_location() ? -1 : label;
- if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
+ if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
{
// Either if there are no confidence predictions
// or there are no location predictions for current label.
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label);
}
const std::vector<BBox> &bboxes = decode_bboxes.find(loc_label)->second;
- const std::vector<int> &indices = it.second;
+ const std::vector<int> &indices = it.second;
- for(auto idx : indices)
+ for (auto idx : indices)
{
*(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7)))) = i;
*(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label;
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index 31f1fafd69..2861d6cacb 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include <cstddef>
@@ -36,53 +38,76 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
- DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox)
+Status validate_arguments(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
+ DetectionPostProcessLayerInfo info,
+ const unsigned int kBatchSize,
+ const unsigned int kNumCoordBox)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize].");
- if(input_box_encoding->num_dimensions() > 2)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3,
+ "The location input tensor shape should be [4, N, kBatchSize].");
+ if (input_box_encoding->num_dimensions() > 2)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(
+ input_box_encoding->dimension(2) != kBatchSize,
+ "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1),
- "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize].");
- if(input_anchors->num_dimensions() > 2)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox,
+ "The first dimension of the input box_encoding tensor should be equal to %d.",
+ kNumCoordBox);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ input_class_score->dimension(0) != (info.num_classes() + 1),
+ "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3,
+ "The anchors input tensor shape should be [4, N, kBatchSize].");
+ if (input_anchors->num_dimensions() > 2)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox,
+ "The first dimension of the input anchors tensor should be equal to %d.",
+ kNumCoordBox);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1))
- || (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) ||
+ (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
"The second dimension of the inputs should be the same.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1,
+ "The num_detection output tensor shape should be [M].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f),
+ "The intersection over union should be positive and less than 1.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0,
+ "The number of max classes per detection should be positive.");
const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection();
// Validate configured outputs
- if(output_boxes->total_size() != 0)
+ if (output_boxes->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(),
+ TensorShape(4U, num_detected_boxes, 1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32);
}
- if(output_classes->total_size() != 0)
+ if (output_classes->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(),
+ TensorShape(num_detected_boxes, 1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32);
}
- if(output_scores->total_size() != 0)
+ if (output_scores->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(),
+ TensorShape(num_detected_boxes, 1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32);
}
- if(num_detection->total_size() != 0)
+ if (num_detection->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32);
@@ -91,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn
return Status{};
}
-inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+inline void
+DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
{
const float half_factor = 0.5f;
// BBox is equavalent to CenterSizeEncoding [y,x,h,w]
const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
- const float half_h = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
- const float half_w = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+ const float half_h =
+ half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+ const float half_w =
+ half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
// Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
auto decoded_ptr = reinterpret_cast<float *>(decoded_it.ptr());
@@ -116,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode
* @param[in] info The detection informations
* @param[out] decoded_boxes The decoded bboxes.
*/
-void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes)
+void DecodeCenterSizeBoxes(const ITensor *input_box_encoding,
+ const ITensor *input_anchors,
+ DetectionPostProcessLayerInfo info,
+ Tensor *decoded_boxes)
{
const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info();
const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info();
- BBox box_centersize{ {} };
- BBox anchor{ {} };
+ BBox box_centersize{{}};
+ BBox anchor{{}};
Window win;
win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape());
@@ -131,103 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp
Iterator anchor_it(input_anchors, win);
Iterator decoded_it(decoded_boxes, win);
- if(input_box_encoding->info()->data_type() == DataType::QASYMM8)
+ if (input_box_encoding->info()->data_type() == DataType::QASYMM8)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
- const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
- box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box),
- dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)
- });
- anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
- dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)
- });
- DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
- },
- box_it, anchor_it, decoded_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
+ box_centersize =
+ BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box),
+ dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)});
+ anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors),
+ dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
+ dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors),
+ dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)});
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
}
- else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+ else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
- const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
- box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
- dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)
- });
- anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
- dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)
- });
- DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
- },
- box_it, anchor_it, decoded_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
+ box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box),
+ dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
+ dequantize_qasymm8_signed(*(2 + box_ptr), qi_box),
+ dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)});
+ anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors),
+ dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
+ dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors),
+ dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)});
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr());
- const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
- box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) });
- anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) });
- DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
- },
- box_it, anchor_it, decoded_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
+ box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)});
+ anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)});
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
}
}
-void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms,
- std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores,
- ITensor *num_detection)
+void SaveOutputs(const Tensor *decoded_boxes,
+ const std::vector<int> &result_idx_boxes_after_nms,
+ const std::vector<float> &result_scores_after_nms,
+ const std::vector<int> &result_classes_after_nms,
+ std::vector<unsigned int> &sorted_indices,
+ const unsigned int num_output,
+ const unsigned int max_detections,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection)
{
// xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax
unsigned int i = 0;
- for(; i < num_output; ++i)
+ for (; i < num_output; ++i)
{
const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]];
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx))));
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx))));
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx))));
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx))));
- *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = static_cast<float>(result_classes_after_nms[sorted_indices[i]]);
- *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]];
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx))));
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx))));
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx))));
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx))));
+ *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) =
+ static_cast<float>(result_classes_after_nms[sorted_indices[i]]);
+ *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) =
+ result_scores_after_nms[sorted_indices[i]];
}
- for(; i < max_detections; ++i)
+ for (; i < max_detections; ++i)
{
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f;
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f;
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f;
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f;
- *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f;
- *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f;
+ *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f;
+ *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f;
}
*(reinterpret_cast<float *>(num_detection->ptr_to_element(Coordinates(0)))) = num_output;
}
} // namespace
CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr),
- _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(),
- _selected_indices(), _class_scores(), _input_scores_to_use(nullptr)
+ : _memory_group(std::move(memory_manager)),
+ _nms(),
+ _input_box_encoding(nullptr),
+ _input_scores(nullptr),
+ _input_anchors(nullptr),
+ _output_boxes(nullptr),
+ _output_classes(nullptr),
+ _output_scores(nullptr),
+ _num_detection(nullptr),
+ _info(),
+ _num_boxes(),
+ _num_classes_with_background(),
+ _num_max_detected_boxes(),
+ _dequantize_scores(false),
+ _decoded_boxes(),
+ _decoded_scores(),
+ _selected_indices(),
+ _class_scores(),
+ _input_scores_to_use(nullptr)
{
}
-void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
+ const ITensor *input_scores,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+ output_scores);
+ ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
+ num_detection, info);
+
_num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection();
- auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+ auto_init_if_empty(*output_boxes->info(),
+ TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+ auto_init_if_empty(*output_classes->info(),
+ TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+ auto_init_if_empty(*output_scores->info(),
+ TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(),
- num_detection->info(),
- info, _kBatchSize, _kNumCoordBox));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+ output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox));
_input_box_encoding = input_box_encoding;
_input_scores = input_scores;
@@ -239,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
_info = info;
_num_boxes = input_box_encoding->info()->dimension(1);
_num_classes_with_background = _input_scores->info()->dimension(0);
- _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
-
- auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32));
+ _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
+
+ auto_init_if_empty(*_decoded_boxes.info(),
+ TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1,
+ DataType::F32));
+ auto_init_if_empty(
+ *_decoded_scores.info(),
+ TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize),
+ 1, DataType::F32));
+ auto_init_if_empty(
+ *_selected_indices.info(),
+ TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1,
+ DataType::S32));
const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes());
- auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32));
+ auto_init_if_empty(
+ *_class_scores.info(),
+ TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1,
+ DataType::F32));
_input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores;
@@ -254,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
_memory_group.manage(&_decoded_scores);
_memory_group.manage(&_selected_indices);
_memory_group.manage(&_class_scores);
- _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold());
+ _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices,
+ info.use_regular_nms() ? info.detection_per_class() : info.max_detections(),
+ info.nms_score_threshold(), info.iou_threshold());
// Allocate and reserve intermediate tensors and vectors
_decoded_boxes.allocator()->allocate();
@@ -263,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
_class_scores.allocator()->allocate();
}
-Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
+ DetectionPostProcessLayerInfo info)
{
- constexpr unsigned int kBatchSize = 1;
- constexpr unsigned int kNumCoordBox = 4;
- const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32);
- const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32);
- const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32);
-
- ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(),
- info.iou_threshold()));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox));
+ constexpr unsigned int kBatchSize = 1;
+ constexpr unsigned int kNumCoordBox = 4;
+ const TensorInfo _decoded_boxes_info =
+ TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32);
+ const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32);
+ const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info,
+ &_selected_indices_info, info.max_detections(),
+ info.nms_score_threshold(), info.iou_threshold()));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes,
+ output_classes, output_scores, num_detection, info, kBatchSize,
+ kNumCoordBox));
return Status{};
}
@@ -287,62 +393,69 @@ void CPPDetectionPostProcessLayer::run()
DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes);
// Decode scores if necessary
- if(_dequantize_scores)
+ if (_dequantize_scores)
{
- if(_input_box_encoding->info()->data_type() == DataType::QASYMM8)
+ if (_input_box_encoding->info()->data_type() == DataType::QASYMM8)
{
- for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+ for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
{
- for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+ for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
{
*(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
- dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+ dequantize_qasymm8(
+ *(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))),
+ _input_scores->info()->quantization_info());
}
}
}
- else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+ else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
{
- for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+ for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
{
- for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+ for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
{
*(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
- dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+ dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(
+ _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))),
+ _input_scores->info()->quantization_info());
}
}
}
}
// Regular NMS
- if(_info.use_regular_nms())
+ if (_info.use_regular_nms())
{
std::vector<int> result_idx_boxes_after_nms;
std::vector<int> result_classes_after_nms;
std::vector<float> result_scores_after_nms;
std::vector<unsigned int> sorted_indices;
- for(unsigned int c = 0; c < num_classes; ++c)
+ for (unsigned int c = 0; c < num_classes; ++c)
{
// For each box, get its score for the class c
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
*(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) =
- *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
+ *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(
+ Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
}
// Run Non-maxima Suppression
_nms.run();
- for(unsigned int i = 0; i < _info.detection_per_class(); ++i)
+ for (unsigned int i = 0; i < _info.detection_per_class(); ++i)
{
- const auto selected_index = *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i))));
- if(selected_index == -1)
+ const auto selected_index =
+ *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i))));
+ if (selected_index == -1)
{
// NMS fills the trailing, invalid entries with -1, so stop at the first one
break;
}
result_idx_boxes_after_nms.emplace_back(selected_index);
- result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
+ result_scores_after_nms.emplace_back(
+ (reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
result_classes_after_nms.emplace_back(c);
}
}
@@ -354,49 +467,46 @@ void CPPDetectionPostProcessLayer::run()
// Sort selected indices based on result scores
sorted_indices.resize(num_selected);
std::iota(sorted_indices.begin(), sorted_indices.end(), 0);
- std::partial_sort(sorted_indices.data(),
- sorted_indices.data() + num_output,
+ std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output,
sorted_indices.data() + num_selected,
[&](unsigned int first, unsigned int second)
- {
-
- return result_scores_after_nms[first] > result_scores_after_nms[second];
- });
+ { return result_scores_after_nms[first] > result_scores_after_nms[second]; });
- SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices,
- num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+ SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms,
+ sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores,
+ _num_detection);
}
// Fast NMS
else
{
- const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
+ const unsigned int num_classes_per_box =
+ std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
std::vector<float> max_scores;
std::vector<int> box_indices;
std::vector<int> max_score_classes;
- for(unsigned int b = 0; b < _num_boxes; ++b)
+ for (unsigned int b = 0; b < _num_boxes; ++b)
{
std::vector<float> box_scores;
- for(unsigned int c = 0; c < num_classes; ++c)
+ for (unsigned int c = 0; c < num_classes; ++c)
{
- box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
+ box_scores.emplace_back(
+ *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
}
std::vector<unsigned int> max_score_indices;
max_score_indices.resize(_info.num_classes());
std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0);
- std::partial_sort(max_score_indices.data(),
- max_score_indices.data() + num_classes_per_box,
+ std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box,
max_score_indices.data() + num_classes,
[&](unsigned int first, unsigned int second)
- {
- return box_scores[first] > box_scores[second];
- });
+ { return box_scores[first] > box_scores[second]; });
- for(unsigned int i = 0; i < num_classes_per_box; ++i)
+ for (unsigned int i = 0; i < num_classes_per_box; ++i)
{
- const float score_to_add = box_scores[max_score_indices[i]];
- *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add;
+ const float score_to_add = box_scores[max_score_indices[i]];
+ *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) =
+ score_to_add;
max_scores.emplace_back(score_to_add);
box_indices.emplace_back(b);
max_score_classes.emplace_back(max_score_indices[i]);
@@ -406,10 +516,10 @@ void CPPDetectionPostProcessLayer::run()
// Run Non-maxima Suppression
_nms.run();
std::vector<unsigned int> selected_indices;
- for(unsigned int i = 0; i < max_detections; ++i)
+ for (unsigned int i = 0; i < max_detections; ++i)
{
// NMS returns M valid indices, the not valid tail is filled with -1
- if(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1)
+ if (*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1)
{
// Stop at the first -1: every entry after it is invalid as well
break;
@@ -419,8 +529,8 @@ void CPPDetectionPostProcessLayer::run()
// We select the max detection numbers of the highest score of all classes
const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size());
- SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices,
- num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+ SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output,
+ max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
}
}
} // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
index d0d0b1e98b..3217742c6b 100644
--- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
+++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,21 +25,32 @@
#include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
+#include "src/common/utils/Log.h"
+
namespace arm_compute
{
-void CPPNonMaximumSuppression::configure(
- const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
- const float score_threshold, const float nms_threshold)
+void CPPNonMaximumSuppression::configure(const ITensor *bboxes,
+ const ITensor *scores,
+ ITensor *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold)
{
+ ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+
auto k = std::make_unique<CPPNonMaximumSuppressionKernel>();
k->configure(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
_kernel = std::move(k);
}
-Status CPPNonMaximumSuppression::validate(
- const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
- const float score_threshold, const float nms_threshold)
+Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold)
{
- return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+ return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold,
+ nms_threshold);
}
} // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPPermute.cpp b/src/runtime/CPP/functions/CPPPermute.cpp
index 76fa09f12b..83941f1dc1 100644
--- a/src/runtime/CPP/functions/CPPPermute.cpp
+++ b/src/runtime/CPP/functions/CPPPermute.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,14 @@
#include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
+#include "src/common/utils/Log.h"
+
using namespace arm_compute;
void CPPPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, perm);
+
auto k = std::make_unique<CPPPermuteKernel>();
k->configure(input, output, perm);
_kernel = std::move(k);
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
index 2547e56a1d..3d64def804 100644
--- a/src/runtime/CPP/functions/CPPTopKV.cpp
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,16 +25,23 @@
#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
+#include "src/common/utils/Log.h"
+
namespace arm_compute
{
void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
{
+ ARM_COMPUTE_LOG_PARAMS(predictions, targets, output, k);
+
auto kernel = std::make_unique<CPPTopKVKernel>();
kernel->configure(predictions, targets, output, k);
_kernel = std::move(kernel);
}
-Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKV::validate(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
return CPPTopKVKernel::validate(predictions, targets, output, k);
}
diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp
index 3b4ba2ba42..8f72473aeb 100644
--- a/src/runtime/CPP/functions/CPPUpsample.cpp
+++ b/src/runtime/CPP/functions/CPPUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,14 @@
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
+#include "src/common/utils/Log.h"
+
using namespace arm_compute;
void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, info);
+
auto k = std::make_unique<CPPUpsampleKernel>();
k->configure(input, output, info);
_kernel = std::move(k);
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index df04fed401..ecf84abd2c 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,15 @@
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Log.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuInfo.h"
#include "src/runtime/SchedulerUtils.h"
namespace arm_compute
{
IScheduler::IScheduler()
- : _cpu_info()
{
// Work out the best possible number of execution threads
_num_threads_hint = cpuinfo::num_threads_hint();
@@ -40,7 +41,7 @@ IScheduler::IScheduler()
CPUInfo &IScheduler::cpu_info()
{
- return _cpu_info;
+ return CPUInfo::get();
}
void IScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func)
@@ -59,7 +60,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
#ifndef BARE_METAL
const Window &max_window = window;
- if(hints.split_dimension() == IScheduler::split_dimensions_all)
+ if (hints.split_dimension() == IScheduler::split_dimensions_all)
{
/*
* if the split dim is size_t max then this signals we should parallelise over
@@ -73,27 +74,27 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n);
std::vector<IScheduler::Workload> workloads;
- for(unsigned int ni = 0; ni != n_threads; ++ni)
+ for (unsigned int ni = 0; ni != n_threads; ++ni)
{
- for(unsigned int mi = 0; mi != m_threads; ++mi)
+ for (unsigned int mi = 0; mi != m_threads; ++mi)
{
workloads.push_back(
- [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info)
- {
- //narrow the window to our mi-ni workload
- Window win = max_window.split_window(Window::DimX, mi, m_threads)
- .split_window(Window::DimY, ni, n_threads);
+ [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info)
+ {
+ //narrow the window to our mi-ni workload
+ Window win = max_window.split_window(Window::DimX, mi, m_threads)
+ .split_window(Window::DimY, ni, n_threads);
- win.validate();
+ win.validate();
- Window thread_locator;
- thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
- thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+ Window thread_locator;
+ thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+ thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
- thread_locator.validate();
+ thread_locator.validate();
- kernel->run_nd(win, info, thread_locator);
- });
+ kernel->run_nd(win, info, thread_locator);
+ });
}
}
run_workloads(workloads);
@@ -103,16 +104,16 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
const unsigned int num_threads = std::min(num_iterations, this->num_threads());
- if(num_iterations == 0)
+ if (num_iterations == 0)
{
return;
}
- if(!kernel->is_parallelisable() || num_threads == 1)
+ if (!kernel->is_parallelisable() || num_threads == 1)
{
ThreadInfo info;
- info.cpu_info = &_cpu_info;
- if(tensors.empty())
+ info.cpu_info = &cpu_info();
+ if (tensors.empty())
{
kernel->run(max_window, info);
}
@@ -124,14 +125,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
else
{
unsigned int num_windows = 0;
- switch(hints.strategy())
+ switch (hints.strategy())
{
case StrategyHint::STATIC:
num_windows = num_threads;
break;
case StrategyHint::DYNAMIC:
{
- const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+ const unsigned int granule_threshold =
+ (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
// Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
break;
@@ -139,16 +141,19 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
default:
ARM_COMPUTE_ERROR("Unknown strategy");
}
+ // Make sure the smallest window is larger than minimum workload size
+ num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
+
std::vector<IScheduler::Workload> workloads(num_windows);
- for(unsigned int t = 0; t < num_windows; ++t)
+ for (unsigned int t = 0; t < num_windows; ++t)
{
//Capture 't' by copy, all the other variables by reference:
- workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+ workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
{
Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
win.validate();
- if(tensors.empty())
+ if (tensors.empty())
{
kernel->run(win, info);
}
@@ -172,4 +177,44 @@ void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const ch
run_workloads(workloads);
}
+std::size_t IScheduler::adjust_num_of_windows(const Window &window,
+ std::size_t split_dimension,
+ std::size_t init_num_windows,
+ const ICPPKernel &kernel,
+ const CPUInfo &cpu_info)
+{
+ // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
+ if (window.num_iterations(split_dimension) < init_num_windows)
+ {
+ auto recommended_split_dim = Window::DimX;
+ for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
+ {
+ if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
+ {
+ recommended_split_dim = dims;
+ }
+ }
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim",
+ split_dimension, recommended_split_dim);
+ }
+
+ for (auto t = init_num_windows; t > 0; --t) // Try the highest number of windows (init_num_windows) first
+ {
+ // Try splitting the workload into t, subject to each subworkload size >= mws (minimum workload size).
+ if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
+ {
+ if (t != init_num_windows)
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_CORE(
+ "The scheduler is using a different thread count than the one assigned by the user.");
+ }
+ return t;
+ }
+ }
+ ARM_COMPUTE_LOG_INFO_MSG_CORE(
+ "The scheduler is using single thread instead of the thread count assigned by the user.");
+ return 1; // If the workload is so small that it can't be split, we should run a single thread
+}
+
} // namespace arm_compute
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index a6bc950644..8e5b62ae7d 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager()
void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
{
- if(_active_group == nullptr)
+ if (_active_group == nullptr)
{
ARM_COMPUTE_ERROR_ON(group == nullptr);
_active_group = group;
@@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
bool ISimpleLifetimeManager::release_group(IMemoryGroup *group)
{
- if(group == nullptr)
+ if (group == nullptr)
{
return false;
}
const bool status = bool(_finalized_groups.erase(group));
- if(status)
+ if (status)
{
group->mappings().clear();
}
@@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group)
void ISimpleLifetimeManager::start_lifetime(void *obj)
{
ARM_COMPUTE_ERROR_ON(obj == nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!");
+ ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements),
+ "Memory object is already registered!");
// Check if there is a free blob
- if(_free_blobs.empty())
+ if (_free_blobs.empty())
{
- _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
+ _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}});
}
else
{
@@ -100,10 +101,8 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t
el.status = true;
// Find object in the occupied lists
- auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
- {
- return obj == b.id;
- });
+ auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs),
+ [&obj](const Blob &b) { return obj == b.id; });
ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs));
// Update occupied blob and return as free
@@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t
_free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
// Check if all objects are finalized and reset active group
- if(are_all_finalized())
+ if (are_all_finalized())
{
ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty());
@@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t
bool ISimpleLifetimeManager::are_all_finalized() const
{
- return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e)
- {
- return !e.second.status;
- });
+ return !std::any_of(std::begin(_active_elements), std::end(_active_elements),
+ [](const std::pair<void *, Element> &e) { return !e.second.status; });
}
} // namespace arm_compute
diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp
index 081cd990f3..96287dcc49 100644
--- a/src/runtime/IWeightsManager.cpp
+++ b/src/runtime/IWeightsManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,23 +25,27 @@
namespace arm_compute
{
-IWeightsManager::IWeightsManager()
- : _managed_weights(), _managed_weights_parents()
+IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents()
{
}
void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent)
{
- if(!are_weights_managed(weights))
+ if (!are_weights_managed(weights))
{
_managed_weights[weights];
+ _managed_counter[weights];
+ }
+ else
+ {
+ _managed_counter[weights].counter++;
}
// In case the weights are an output of a previous reshape function
// store the parent's link
- if(parent != nullptr)
+ if (parent != nullptr)
{
- if(_managed_weights_parents.find(weights) == _managed_weights_parents.end())
+ if (_managed_weights_parents.find(weights) == _managed_weights_parents.end())
{
_managed_weights_parents[weights] = parent;
}
@@ -54,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
// Find if I have the same weights with weights transform. If I do, don't run the reshape
auto item = _managed_weights.find(weights);
- bool perform_run{ true };
- ITensor *weights_tensor{ nullptr };
+ bool perform_run{true};
+ ITensor *weights_tensor{nullptr};
// Check if I already have the requested transform and I have run the reshape function
- for(auto it : item->second)
+ for (auto it : item->second)
{
- if(it->is_reshape_run() && (it->uid() == weights_transform->uid()))
+ if (it->is_reshape_run() && (it->uid() == weights_transform->uid()))
{
weights_tensor = it->get_weights();
perform_run = false;
@@ -68,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
}
}
- if(perform_run)
+ if (perform_run)
{
weights_transform->run();
weights_tensor = weights_transform->get_weights();
@@ -76,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
// Check if we can release memory from parent
auto parent_item = _managed_weights_parents.find(weights);
- if(parent_item != _managed_weights_parents.end())
+ if (parent_item != _managed_weights_parents.end())
{
int32_t refcount = parent_item->second->decrease_refcount();
- if(refcount == 0)
+ if (refcount == 0)
{
parent_item->second->release();
}
@@ -87,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
// Check top level weights. If all the transformations are done
// mark the weights as unused
- if(_managed_weights_parents.find(weights) == _managed_weights_parents.end())
+ if (_managed_weights_parents.find(weights) == _managed_weights_parents.end())
{
auto item = _managed_weights.find(weights);
bool mark_as_unused = true;
- for(auto it : item->second)
+ for (auto it : item->second)
{
- if(!it->is_reshape_run())
+ if (!it->is_reshape_run())
{
mark_as_unused = false;
break;
}
}
- if(mark_as_unused)
+ if (mark_as_unused)
{
weights->mark_as_unused();
}
@@ -118,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei
{
ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed");
- ITensor *transformed_weights{ nullptr };
+ ITensor *transformed_weights{nullptr};
auto item = _managed_weights.find(weights);
// Check if I already have the requested transform. If I do,
// increase the refcount of the transformed weights object and
// reuse the tensor
- for(auto it : item->second)
+ for (auto it : item->second)
{
- if(it->uid() == weights_transform->uid())
+ if (it->uid() == weights_transform->uid())
{
transformed_weights = it->get_weights();
it->increase_refcount();
@@ -134,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei
}
}
- if(transformed_weights == nullptr)
+ if (transformed_weights == nullptr)
{
transformed_weights = weights_transform->get_weights();
weights_transform->increase_refcount();
@@ -146,4 +150,28 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei
return transformed_weights;
}
+
+void IWeightsManager::release(const ITensor *weights)
+{
+    // Null or unmanaged tensors have no usage counter to update, so they are ignored.
+    if (weights != nullptr && are_weights_managed(weights))
+    {
+        auto &entry = _managed_counter[weights]; // one lookup, reused for the decrement and both checks
+        entry.counter--;
+        if (entry.counter == 0 && entry.is_unused)
+        {
+            weights->mark_as_unused();
+        }
+    }
+}
+
+void IWeightsManager::pre_mark_as_unused(const ITensor *weights)
+{
+    // Only tensors tracked by this manager get flagged; null or unmanaged ones are left untouched.
+    const bool is_tracked = (weights != nullptr) && are_weights_managed(weights);
+    if (is_tracked)
+    {
+        _managed_counter[weights].is_unused = true;
+    }
+}
} // namespace arm_compute
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index ac0a32539e..90fd025eb7 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -27,20 +27,17 @@
namespace arm_compute
{
-Memory::Memory()
- : _region(nullptr), _region_owned(nullptr)
+Memory::Memory() : _region(nullptr), _region_owned(nullptr)
{
}
-Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory)
- : _region(nullptr), _region_owned(memory)
+Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
}
-Memory::Memory(IMemoryRegion *memory)
- : _region(memory), _region_owned(nullptr)
+Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
{
_region = memory;
}
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
index 2e418ae9e3..5fa9ea47e9 100644
--- a/src/runtime/MemoryManagerOnDemand.cpp
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -31,7 +31,8 @@
namespace arm_compute
{
-MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
+MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager,
+ std::shared_ptr<IPoolManager> pool_manager)
: _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager))
{
ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!");
@@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t
// Create pools
auto pool_template = _lifetime_mgr->create_pool(&allocator);
- for(int i = num_pools; i > 1; --i)
+ for (int i = num_pools; i > 1; --i)
{
auto pool = pool_template->duplicate();
_pool_mgr->register_pool(std::move(pool));
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
index a5fc0a2726..fcfd3251ff 100644
--- a/src/runtime/NEON/INEOperator.cpp
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/INEOperator.h"
+
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -32,14 +34,13 @@ namespace experimental
{
INEOperator::~INEOperator() = default;
-INEOperator::INEOperator(IRuntimeContext *ctx)
- : _kernel(), _ctx(ctx), _workspace()
+INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
{
}
void INEOperator::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index 5438bce62a..b6977221b9 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
@@ -33,8 +34,7 @@ namespace arm_compute
INESimpleFunction::~INESimpleFunction() = default;
INESimpleFunction::INESimpleFunction() // NOLINT
- : _kernel(),
- _border_handler()
+ : _kernel(), _border_handler()
{
}
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
index 21dd58e378..04bff9fa4b 100644
--- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/NEON/INEKernel.h"
#include "src/runtime/Utils.h"
@@ -32,9 +33,7 @@ namespace arm_compute
{
INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default;
-INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx)
- : _kernel(),
- _ctx(ctx)
+INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx)
{
}
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 2b5c51fa5a..59199452ce 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -24,24 +24,24 @@
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
+
+#include "src/cpu/operators/CpuActivation.h"
namespace arm_compute
{
struct NEActivationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- IRuntimeContext *ctx{ nullptr };
- std::unique_ptr<cpu::CpuActivation> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ IRuntimeContext *ctx{nullptr};
+ std::unique_ptr<cpu::CpuActivation> op{nullptr};
};
-NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
+NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default;
NEActivationLayer::~NEActivationLayer() = default;
@@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay
_impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info);
}
-Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
return cpu::CpuActivation::validate(input, output, act_info);
}
diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp
new file mode 100644
index 0000000000..a72364791c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
+
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuAddMulAdd.h"
+
+namespace arm_compute
+{
+struct NEAddMulAdd::Impl // Private state of NEAddMulAdd, reached through _impl.
+{
+ std::unique_ptr<cpu::CpuAddMulAdd> op{nullptr}; // CPU operator: created and configured in configure(), executed in run()
+ WorkspaceData<Tensor> workspace_tensors{}; // Result of manage_workspace() for the operator's workspace() request
+ ITensorPack run_pack{}; // Source/destination tensors bound in configure() and passed to op->run()
+ MemoryGroup memory_group{}; // Replaced in the constructor from the given memory manager; handed to manage_workspace()
+};
+
+NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
+{ // Impl is default-constructed by the initializer above; only its memory group is replaced here.
+ _impl->memory_group = MemoryGroup(std::move(memory_manager)); // moves the by-value manager handle into a new MemoryGroup
+}
+
+NEAddMulAdd::~NEAddMulAdd() = default; // defaulted in this file, after Impl has been defined above
+
+void NEAddMulAdd::configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add,
+                            ITensor *add_output, ITensor *final_output,
+                            const ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+
+    _impl->op = std::make_unique<cpu::CpuAddMulAdd>();
+    // add_output is optional: its info is queried only when a tensor was actually supplied.
+    auto *add_output_info = (add_output == nullptr) ? nullptr : add_output->info();
+    _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), add_output_info,
+                         final_output->info(), policy, act_info);
+
+    // Tensors handed to the operator on every run(), keyed by their slot id.
+    _impl->run_pack = {
+        {TensorType::ACL_SRC_0, input1},     {TensorType::ACL_SRC_1, input2},
+        {TensorType::ACL_SRC_2, bn_mul},     {TensorType::ACL_SRC_3, bn_add},
+        {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output},
+    };
+
+    // The operator's workspace request is resolved against this function's memory group and run pack.
+    _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
+}
+
+Status NEAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+                             const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
+                             const ITensorInfo *add_output, const ITensorInfo *final_output,
+                             ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    // No checks are added at this level: the verdict comes straight from the CPU operator.
+    Status op_status =
+        cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+
+    return op_status;
+}
+
+void NEAddMulAdd::run()
+{ // Executes the operator set up by configure(); _impl->op is null until configure() has been called.
+ _impl->op->run(_impl->run_pack); // NOTE(review): memory_group is not acquired in this function - confirm the workspace tensors do not need it
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 7bca20d46c..fbaf1a96e7 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,31 +29,68 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/functions/NECast.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
+struct NEArgMinMaxLayer::Impl // Private state of NEArgMinMaxLayer, reached through _impl.
+{
+ MemoryGroup memory_group{}; // Rebuilt from memory_manager in configure() on the S64/U64 output path; scoped in run()
+ std::shared_ptr<IMemoryManager> memory_manager{}; // Stored by the constructor; moved into memory_group by configure() on the S64/U64 path
+ std::unique_ptr<NEReductionOperation> reduction_function{}; // Created on every configure() call; runs first in run()
+ std::unique_ptr<NECast> cast_function{}; // Created only for S64/U64 outputs; casts tmp_reduction_result into the output
+ std::unique_ptr<Tensor> tmp_reduction_result{}; // Intermediate reduction result on the S64/U64 path; when non-null, run() also runs the cast
+};
+
NEArgMinMaxLayer::~NEArgMinMaxLayer() = default;
-NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _reduction_function(std::make_unique<NEReductionOperation>())
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
- ARM_COMPUTE_UNUSED(memory_manager);
+ _impl->memory_manager = std::move(memory_manager);
}
+
void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
{
- _reduction_function->configure(input, output, axis, op, false);
+ ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
+ _impl->reduction_function = std::make_unique<NEReductionOperation>();
+ if (output->info() &&
+ (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64))
+ {
+ _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+ _impl->cast_function = std::make_unique<NECast>();
+ _impl->tmp_reduction_result = std::make_unique<Tensor>();
+ _impl->reduction_function->configure(input, _impl->tmp_reduction_result.get(), axis, op, false);
+ _impl->cast_function->configure(_impl->tmp_reduction_result.get(), output, ConvertPolicy::SATURATE);
+ _impl->memory_group.manage(_impl->tmp_reduction_result.get());
+ _impl->tmp_reduction_result->allocator()->allocate();
+ }
+ else
+ {
+ _impl->reduction_function->configure(input, output, axis, op, false);
+ }
}
-Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid operation");
return NEReductionOperation::validate(input, output, axis, op, false);
}
void NEArgMinMaxLayer::run()
{
- _reduction_function->run();
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->reduction_function->run();
+ if (_impl->tmp_reduction_result != nullptr)
+ {
+ _impl->cast_function->run();
+ }
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 2e4755b949..aff16ae9d1 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuAdd.h"
+
+#include "src/cpu/operators/CpuAdd.h"
#include <utility>
@@ -32,26 +33,33 @@ namespace arm_compute
{
struct NEArithmeticAddition::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuAdd> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuAdd> op{nullptr};
};
-NEArithmeticAddition::NEArithmeticAddition()
- : _impl(std::make_unique<Impl>())
+NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>())
{
}
-NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
+NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default;
NEArithmeticAddition::~NEArithmeticAddition() = default;
-Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticAddition::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuAdd::validate(input1, input2, output, policy, act_info);
}
-void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticAddition::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 0263d4cbb6..097525c1a8 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuSub.h"
+
+#include "src/cpu/operators/CpuSub.h"
#include <utility>
@@ -32,26 +33,33 @@ namespace arm_compute
{
struct NEArithmeticSubtraction::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuSub> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuSub> op{nullptr};
};
-NEArithmeticSubtraction::NEArithmeticSubtraction()
- : _impl(std::make_unique<Impl>())
+NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique<Impl>())
{
}
-NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default;
NEArithmeticSubtraction::~NEArithmeticSubtraction() = default;
-Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuSub::validate(input1, input2, output, policy, act_info);
}
-void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticSubtraction::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index b90a38b47f..d491f0aafc 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,29 +29,44 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
namespace arm_compute
{
NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default;
-NEBatchNormalizationLayer::NEBatchNormalizationLayer()
- : _norm_kernel()
+NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel()
{
}
-void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon,
+void NEBatchNormalizationLayer::configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
ActivationLayerInfo act_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
// Configure kernel
_norm_kernel = std::make_unique<NEBatchNormalizationLayerKernel>();
_norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info);
}
-Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
return Status{};
}
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index 8f537a650a..5d711c5ddf 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,31 +28,40 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
namespace arm_compute
{
void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
auto k = std::make_unique<NEBatchToSpaceLayerKernel>();
k->configure(input, block_shape, output);
_kernel = std::move(k);
}
-void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output)
+void NEBatchToSpaceLayer::configure(
+ const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
{
auto k = std::make_unique<NEBatchToSpaceLayerKernel>();
- k->configure(input, block_shape_x, block_shape_y, output);
+ k->configure(input, block_shape_x, block_shape_y, output, crop_info);
_kernel = std::move(k);
}
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
}
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
- return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+ return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 81c087988a..89ce2087be 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<NEBitwiseAndKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index 3155df5db3..eda59cd3e9 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseNot::configure(const ITensor *input, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output);
auto k = std::make_unique<NEBitwiseNotKernel>();
k->configure(input, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index 793eb25d80..3d6f30b0fe 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<NEBitwiseOrKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 2d0af63e35..f0cf3d3e5c 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<NEBitwiseXorKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
index cfd14faca0..adf891e417 100644
--- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,28 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
namespace arm_compute
{
-void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransform::configure(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
// Configure Bounding Box kernel
auto k = std::make_unique<NEBoundingBoxTransformKernel>();
k->configure(boxes, pred_boxes, deltas, info);
_kernel = std::move(k);
}
-Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
}
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index b519576ad5..1fd172a730 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,22 +24,23 @@
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCast.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuCast.h"
namespace arm_compute
{
struct NECast::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCast> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCast> op{nullptr};
};
-NECast::NECast()
- : _impl(std::make_unique<Impl>())
+NECast::NECast() : _impl(std::make_unique<Impl>())
{
}
-NECast::NECast(NECast &&) = default;
+NECast::NECast(NECast &&) = default;
NECast &NECast::operator=(NECast &&) = default;
NECast::~NECast() = default;
@@ -49,19 +50,19 @@ void NECast::configure(ITensor *input, ITensor *output, ConvertPolicy policy)
_impl->dst = output;
ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
-
+ ARM_COMPUTE_LOG_PARAMS(input, output, policy);
_impl->op = std::make_unique<cpu::CpuCast>();
_impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
}
-Status NECast::validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy)
+Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
{
return cpu::CpuCast::validate(input, output, policy);
}
void NECast::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
index bf4af83a0d..86bee4dd43 100644
--- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,15 @@
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
namespace arm_compute
{
void NEChannelShuffleLayer::configure(const ITensor *input, ITensor *output, unsigned int num_groups)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, num_groups);
auto k = std::make_unique<NEChannelShuffleLayerKernel>();
k->configure(input, output, num_groups);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index dcc5cd3a64..59a0892f1f 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,33 +23,31 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
-#include "src/runtime/cpu/operators/CpuConcatenate.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/operators/CpuConcatenate.h"
namespace arm_compute
{
struct NEConcatenateLayer::Impl
{
std::vector<const ITensor *> srcs{};
- ITensor *dst{ nullptr };
- unsigned int num_inputs{ 0 };
- unsigned int axis{ 0 };
- std::unique_ptr<cpu::CpuConcatenate> op{ nullptr };
+ ITensor *dst{nullptr};
+ unsigned int num_inputs{0};
+ unsigned int axis{0};
+ std::unique_ptr<cpu::CpuConcatenate> op{nullptr};
};
-NEConcatenateLayer::NEConcatenateLayer()
- : _impl(std::make_unique<Impl>())
+NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique<Impl>())
{
}
-NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
+NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default;
NEConcatenateLayer::~NEConcatenateLayer() = default;
@@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I
_impl->op = std::make_unique<cpu::CpuConcatenate>();
std::vector<const ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+ for (unsigned int i = 0; i < inputs_vector.size(); ++i)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I
_impl->op->configure(inputs_vector_info, _impl->dst->info(), axis);
}
-Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+ const ITensorInfo *output,
+ size_t axis)
{
return cpu::CpuConcatenate::validate(inputs_vector, output, axis);
}
@@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu
void NEConcatenateLayer::run()
{
ITensorPack pack;
- for(unsigned i = 0; i < _impl->num_inputs; ++i)
+ for (unsigned i = 0; i < _impl->num_inputs; ++i)
{
pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
}
diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp
new file mode 100644
index 0000000000..8f41151d6c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConv3D.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuDirectConv3d.h"
+
+namespace arm_compute
+{
+using namespace arm_compute::experimental;
+
+struct NEConv3D::Impl
+{
+ std::unique_ptr<cpu::ICpuOperator> op{nullptr};
+ ITensorPack run_pack{};
+};
+
+NEConv3D::NEConv3D() : _impl(std::make_unique<Impl>())
+{
+}
+
+NEConv3D::~NEConv3D() = default;
+
+void NEConv3D::configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info);
+
+ auto f = std::make_unique<cpu::CpuDirectConv3d>();
+ f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(),
+ conv_info);
+ _impl->op = std::move(f);
+
+ if (_impl->op != nullptr)
+ {
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ }
+}
+
+Status NEConv3D::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv3dInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info));
+
+ return Status{};
+}
+
+void NEConv3D::run()
+{
+ if (_impl->op != nullptr)
+ {
+ _impl->op->run(_impl->run_pack);
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index f2253d8be4..84e8565aaf 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -23,24 +23,27 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
namespace arm_compute
{
struct NEConvertFullyConnectedWeights::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{nullptr};
};
-NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
- : _impl(std::make_unique<Impl>())
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
{
}
NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default;
-void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void NEConvertFullyConnectedWeights::configure(const ITensor *input,
+ ITensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -50,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou
_impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
}
-Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
}
@@ -63,4 +68,4 @@ void NEConvertFullyConnectedWeights::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index e43d976944..8efebbbb1a 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,235 +25,184 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
-#include <cmath>
-#include <tuple>
-#include <utility>
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuConv2d.h"
+#include "src/cpu/operators/CpuDirectConv2d.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
namespace arm_compute
{
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT
- : _memory_manager(std::move(memory_manager)),
- _function()
+using namespace arm_compute::experimental;
+
+struct NEConvolutionLayer::Impl
+{
+ MemoryGroup memory_group{};
+ std::shared_ptr<IMemoryManager> memory_manager{};
+ std::unique_ptr<cpu::ICpuOperator> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
+ std::unique_ptr<IFunction> func{nullptr};
+};
+
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
+ _impl->memory_manager = std::move(memory_manager);
}
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+NEConvolutionLayer::~NEConvolutionLayer() = default;
+
+void NEConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- {
- auto f = std::make_unique<NEWinogradConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
- _function = std::move(f);
- break;
- }
case ConvolutionMethod::GEMM:
- {
- auto f = std::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
- _function = std::move(f);
- break;
- }
case ConvolutionMethod::GEMM_CONV2D:
- {
- auto f = std::make_unique<NEGEMMConv2d>(_memory_manager);
- f->configure(input, weights, biases, output, info);
- _function = std::move(f);
- break;
- }
case ConvolutionMethod::DIRECT:
{
- auto f = std::make_unique<NEDirectConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, act_info);
- _function = std::move(f);
+ auto f = std::make_unique<cpu::CpuConv2d>();
+ f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr),
+ output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ _impl->op = std::move(f);
break;
}
case ConvolutionMethod::FFT:
{
- auto f = std::make_unique<NEFFTConvolutionLayer>(_memory_manager);
+ auto f = std::make_unique<NEFFTConvolutionLayer>(_impl->memory_manager);
f->configure(input, weights, biases, output, conv_info, act_info);
- _function = std::move(f);
+ _impl->func = std::move(f);
break;
}
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
}
+
+ if (_impl->op)
+ {
+ _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ }
}
-Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status NEConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
-
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+
+ // Biases with dynamic values are not supported with quantized inputs.
+ if (biases)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!biases->are_values_constant() && is_data_type_quantized(input->data_type())),
+ "Dynamic Biases are not supported with quantized input data.");
+ }
+
+ switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
- break;
case ConvolutionMethod::GEMM:
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
- break;
case ConvolutionMethod::GEMM_CONV2D:
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConv2d::validate(input, weights, biases, output, info));
- break;
case ConvolutionMethod::DIRECT:
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info,
+ weights_info, dilation, act_info, enable_fast_math,
+ num_groups));
break;
case ConvolutionMethod::FFT:
- ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
break;
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
}
-
return Status{};
}
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
- ARM_COMPUTE_UNUSED(weights_info);
-
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
+ return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math);
+}
- /* Input spatial dims, kernel size, IFM/OFM, conv info*/
- using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
- using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+void NEConvolutionLayer::run()
+{
+ prepare();
- const std::vector<ConfigurationMethod> known_configs =
- {
- // Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
- // VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
- };
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
- const auto find_config = [&](ConfigurationMethod c)
+ if (_impl->func)
{
- const ConvolutionConfiguration config = c.first;
- const PadStrideInfo info = std::get<3>(config);
-
- return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
- };
-
- std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ _impl->func->run();
+ }
+ else
{
- return (*found).second;
+ _impl->op->run(_impl->run_pack);
}
+}
- if(dilation != Size2D(1U, 1U))
+void NEConvolutionLayer::prepare()
+{
+ if (_impl->func)
{
- return ConvolutionMethod::GEMM;
+ _impl->func->prepare();
}
else
{
- // SRGAN
- // Output might not be initialized when it is an internal tensor of the layer using the convolution
- if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
- && (NEDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::DIRECT;
- }
- if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::FFT;
- }
- if(input->dimension(idx_c) < 16)
- {
- return ConvolutionMethod::GEMM;
- }
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // This heuristics only applies to F16 data type on A55r1
- if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
- {
- // Exclude known bad winograd configs (and defaults to GEMM)
- const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
- {
- // Squeezenet_V1_1 fire2 and fire3
- ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
- // Squeezenet_V1_1 fire6 and fire7
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
- // Squeezenet_V1_1 fire8 and fire9
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
- };
- const auto find_conv_config = [&](ConvolutionConfiguration c)
- {
- const PadStrideInfo info = std::get<3>(c);
-
- return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
- };
-
- bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
- find_conv_config)
- != known_bad_winograd_f16_with_fastmath_configs.end();
- if(found_bad)
- {
- return ConvolutionMethod::GEMM;
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // For 1x1 convolutions run the default GEMM
- if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
- {
- return ConvolutionMethod::GEMM;
- }
+ _impl->op->prepare(_impl->prep_pack);
- if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
- {
- return ConvolutionMethod::WINOGRAD;
- }
- if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info)))
- {
- return ConvolutionMethod::GEMM_CONV2D;
- }
- return ConvolutionMethod::GEMM;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
}
}
-
-void NEConvolutionLayer::run()
-{
- prepare();
- _function->run();
-}
-
-void NEConvolutionLayer::prepare()
-{
- _function->prepare();
-}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index 20642b5eed..c975d3a5b5 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCopy.h"
+
+#include "src/cpu/operators/CpuCopy.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NECopy::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCopy> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCopy> op{nullptr};
};
-NECopy::NECopy()
- : _impl(std::make_unique<Impl>())
+NECopy::NECopy() : _impl(std::make_unique<Impl>())
{
}
-NECopy::NECopy(NECopy &&) = default;
+NECopy::NECopy(NECopy &&) = default;
NECopy &NECopy::operator=(NECopy &&) = default;
NECopy::~NECopy() = default;
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index 1e1070d961..a94b0882da 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -21,10 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NECropKernel.h"
#include <cstddef>
@@ -34,18 +36,32 @@ namespace arm_compute
NECropResize::~NECropResize() = default;
NECropResize::NECropResize()
- : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+ : _output(nullptr),
+ _num_boxes(0),
+ _method(),
+ _extrapolation_value(0),
+ _crop(),
+ _scale(),
+ _crop_results(),
+ _scaled_results()
{
}
-Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status NECropResize::validate(const ITensorInfo *input,
+ const ITensorInfo *boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
TensorInfo temp_info;
- ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
- if(output->total_size() > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(),
+ box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1,
+ extrapolation_value));
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -55,11 +71,18 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes
return Status{};
}
-void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void NECropResize::configure(const ITensor *input,
+ const ITensor *boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+ crop_size, method, extrapolation_value));
+ ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
_num_boxes = boxes->info()->tensor_shape()[1];
TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
@@ -79,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I
_scaled_results.reserve(_num_boxes);
_scale.reserve(_num_boxes);
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
auto crop_tensor = std::make_unique<Tensor>();
TensorInfo crop_result_info(1, DataType::F32);
@@ -106,7 +129,7 @@ void NECropResize::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
// Size of the crop box in _boxes and thus the shape of _crop_results[i]
// may not be known until run-time and so the kernels cannot be configured until then.
@@ -115,12 +138,15 @@ void NECropResize::run()
NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
// Scale the cropped image.
- _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false });
+ _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(),
+ ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value),
+ SamplingPolicy::TOP_LEFT, false});
_scaled_results[i]->allocator()->allocate();
_scale[i]->run();
// Copy scaled image into output.
- std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+ std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(),
+ _output->ptr_to_element(Coordinates(0, 0, 0, i)));
}
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 5bd61b4074..081c7cc538 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute::misc::shape_calculator;
@@ -61,9 +62,9 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p
deconv_pad_top += deconv_pad_y / 2;
deconv_pad_bottom += deconv_pad_y / 2;
- return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+ return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom,
+ DimensionRoundingType::FLOOR);
}
-
} // namespace
NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
@@ -77,20 +78,29 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
_original_weights(nullptr),
_input(nullptr),
_info(),
- _is_prepared(false)
+ _is_prepared(false),
+ _do_upsampling(true)
{
}
-Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info)
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
- if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
}
@@ -99,11 +109,23 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info);
+ const unsigned int pad_left = info.pad_left();
+ const unsigned int pad_top = info.pad_top();
+ const unsigned int pad_right = info.pad_right();
+ const unsigned int pad_bottom = info.pad_bottom();
+
+ ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(width_idx) - 1) * info.stride().first +
+ weights->dimension(width_idx)) < (pad_left + pad_right));
+ ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(height_idx) - 1) * info.stride().second +
+ weights->dimension(height_idx)) < (pad_top + pad_bottom));
- if(bias != nullptr)
+ auto out_dims =
+ deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx),
+ weights->dimension(width_idx), weights->dimension(height_idx), info);
+
+ if (bias != nullptr)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -113,57 +135,84 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
}
}
- if(output->tensor_shape().total_size() > 0)
+ if (output->tensor_shape().total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
+ "Output's depth is invalid.");
}
- uint32_t deconv_pad_x = 0;
- uint32_t deconv_pad_y = 0;
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
- // Guard against overflows in compute_deconvolution_upsampled_shape()
- const DataLayout data_layout = input->data_layout();
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int out_x = (input->dimension(idx_w) - 1) * stride_x + 1;
- const unsigned int out_y = (input->dimension(idx_h) - 1) * stride_y + 1;
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) > out_x);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) > out_y);
- ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first);
- ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second);
-
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ uint32_t deconv_pad_x = 0;
+ uint32_t deconv_pad_y = 0;
+ const uint32_t stride_x = info.stride().first;
+ const uint32_t stride_y = info.stride().second;
+ const auto deconv_padding = compute_deconvolution_padding(*input, *weights, static_cast<int32_t>(stride_x),
+ static_cast<int32_t>(stride_y), out_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(deconv_padding.first < 0 || deconv_padding.second < 0,
+ "Negative padding not supported");
+
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+ out_dims, deconv_pad_x, deconv_pad_y);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+ const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
+
+ // Do not perform upsampling when the operation uses unit stride in all dimensions
+ const bool do_upsampling = stride_x != 1 || stride_y != 1;
+
+ const unsigned int batches_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
+ if (do_upsampling)
+ {
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info,
+ weights_info, Size2D(1U, 1U), ActivationLayerInfo(),
+ enable_fast_math));
+ }
+ else
+ {
+ const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+ upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info,
+ Size2D(1U, 1U), ActivationLayerInfo(),
+ enable_fast_math));
+ }
return Status{};
}
-void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info)
+void NEDeconvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math,
+ const WeightsInfo &weights_info)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(),
+ (bias == nullptr) ? nullptr : bias->info(),
+ output->info(), info, enable_fast_math, weights_info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info);
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
- weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
+ auto out_dims = deconvolution_output_dimensions(
+ input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
@@ -176,32 +225,24 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
const unsigned int stride_y = info.stride().second;
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
_flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
- _memory_group.manage(&_scaled_output);
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
// setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- uint32_t deconv_pad_x = 0;
- uint32_t deconv_pad_y = 0;
-
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
- stride_x, stride_y,
- out_dims, deconv_pad_x, deconv_pad_y);
+ uint32_t deconv_pad_x = 0;
+ uint32_t deconv_pad_y = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+ *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
- scale_out_info.set_data_layout(data_layout);
- _scaled_output.allocator()->init(scale_out_info);
-
- _upsample_f.configure(input, &_scaled_output, upsample_info);
-
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+ // Do not perform upsampling when the operation uses unit stride in all dimensions
+ _do_upsampling = stride_x != 1 || stride_y != 1;
// Setup flip axis data
_flip_axis.allocator()->allocate();
@@ -209,7 +250,32 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
axis_data[0] = static_cast<uint32_t>(width_idx);
axis_data[1] = static_cast<uint32_t>(height_idx);
- _scaled_output.allocator()->allocate();
+ // Setup convolution and upsampling, if needed
+ if (_do_upsampling)
+ {
+ _memory_group.manage(&_scaled_output);
+
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ // Minor optimization: In the upsampling step, we do not need to allocate space for the padding in the upsampled image.
+ // The padding amount can be given as input to the convolution layer.
+ _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+ ActivationLayerInfo(), enable_fast_math);
+
+ _scaled_output.allocator()->allocate();
+ }
+ else
+ {
+ const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+ upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+ _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+ ActivationLayerInfo(), enable_fast_math);
+ }
}
void NEDeconvolutionLayer::run()
@@ -218,13 +284,16 @@ void NEDeconvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- _upsample_f.run();
+ if (_do_upsampling)
+ {
+ _upsample_f.run();
+ }
_conv_f.run();
}
void NEDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 07e985c25e..766635dfa1 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCast.h"
+
+#include "src/cpu/operators/CpuCast.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NEDepthConvertLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCast> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCast> op{nullptr};
};
-NEDepthConvertLayer::NEDepthConvertLayer()
- : _impl(std::make_unique<Impl>())
+NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique<Impl>())
{
}
-NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
+NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default;
NEDepthConvertLayer::~NEDepthConvertLayer() = default;
@@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve
_impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
}
-Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
return cpu::CpuCast::validate(input, output, policy);
@@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo
void NEDepthConvertLayer::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index 2793c3f27e..5eea4dca65 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,15 +25,24 @@
#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
namespace arm_compute
{
+NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{}
+{
+}
+
+NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default;
+
void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
+
auto k = std::make_unique<NEDepthToSpaceLayerKernel>();
k->configure(input, output, block_shape);
_kernel = std::move(k);
@@ -43,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo
{
return NEDepthToSpaceLayerKernel::validate(input, output, block_shape);
}
+
+void NEDepthToSpaceLayer::run()
+{
+ NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension());
+}
+
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a561b88058..6c085645db 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,9 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuDepthwiseConv2d.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
@@ -38,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
- ITensor *src{ nullptr }; // SRC_0
- ITensor *dst{ nullptr }; // DST_0
- const ITensor *weights
- {
- nullptr
- }; // SRC_1
- const ITensor *biases
- {
- nullptr
- }; // SRC_2
+ ITensor *src{nullptr}; // SRC_0
+ ITensor *dst{nullptr}; // DST_0
+ const ITensor *weights{nullptr}; // SRC_1
+ const ITensor *biases{nullptr}; // SRC_2
Tensor permuted_input{}; // INT_0
Tensor permuted_weights{}; // INT_1
Tensor permuted_output{}; // INT_2
Tensor workspace{}; // INT_3
Tensor packed_weights{}; // INT_4
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
- bool is_prepared{ false };
- bool permute{ false };
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
+ bool is_prepared{false};
+ bool permute{false};
};
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(
+ std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _impl(std::make_unique<Impl>())
{
}
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(
+ ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -81,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
_impl->permute = is_nhwc;
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
- _impl->dst->info(), info);
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
+ _impl->op->configure(_impl->src->info(), _impl->weights->info(),
+ _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info);
// Configure pipeline
ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
@@ -91,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
- if(!is_activationlayer_enabled)
+ if (!is_activationlayer_enabled)
{
act_info_to_use = act_info;
}
- info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
+ info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation};
auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
- if(is_nhwc)
+ if (is_nhwc)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -121,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
_impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
// Configure optimized depthwise
- dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);
+ dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(),
+ biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(),
+ info);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
@@ -132,28 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
}
else
{
- dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
+ dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(),
+ biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
}
// Allocate memory based on the internal memory requirements
experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
- _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size }, 1, DataType::S8), mem_req[0].alignment);
- _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size }, 1, DataType::S8), mem_req[1].alignment);
-
+ _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8),
+ mem_req[0].alignment);
+ _impl->packed_weights.allocator()->init(
+ TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment);
+ _memory_group.manage(&_impl->workspace);
+ _memory_group.manage(&_impl->packed_weights);
_impl->workspace.allocator()->allocate();
_impl->packed_weights.allocator()->allocate();
}
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+Status
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
@@ -178,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
// Permute weights
- if(_impl->permute)
+ if (_impl->permute)
{
_impl->permuted_weights.allocator()->allocate();
}
- if(!_impl->permuted_weights.is_used())
+ if (!_impl->permuted_weights.is_used())
{
_impl->permuted_weights.allocator()->free();
}
@@ -200,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
Tensor permuted_input{};
Tensor permuted_weights{};
Tensor permuted_output{};
- bool is_prepared{ false };
- bool is_nchw{ false };
- bool is_activationlayer_enabled{ false };
- const ITensor *weights{ nullptr };
- const ITensor *biases{ nullptr };
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+ bool is_prepared{false};
+ bool is_nchw{false};
+ bool is_activationlayer_enabled{false};
+ const ITensor *weights{nullptr};
+ const ITensor *biases{nullptr};
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
@@ -215,16 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv
{
}
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
- output->info(), conv_info, depth_multiplier, act_info, dilation));
- const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
- _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
+ _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(),
+ info);
_impl->src = input;
_impl->dst = output;
@@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = output;
- if(_impl->is_nchw)
+ if (_impl->is_nchw)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
_impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
weights_to_use = &_impl->permuted_weights;
- _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ _impl->permuted_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
output_to_use = &_impl->permuted_output;
}
auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
- depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
+ depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(),
+ biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
- if(_impl->is_nchw)
+ if (_impl->is_nchw)
{
auto permute_output = std::make_unique<cpu::CpuPermute>();
permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
@@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
}
}
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
@@ -298,43 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemory
#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
{
- DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
- NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+ DepthwiseConvolutionFunction depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED};
+ NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr};
NEDepthwiseConvolutionLayerGeneric func_generic{};
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
};
#endif // DOXYGEN_SKIP_THIS
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(
+ input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info,
+ depth_multiplier, act_info, dilation));
+
+ const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
_impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>();
- _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- info);
- switch(_impl->depth_conv_func)
+ _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+ dilation);
break;
case DepthwiseConvolutionFunction::GENERIC:
- _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+ dilation);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::run()
{
- switch(_impl->depth_conv_func)
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.run();
@@ -349,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run()
void NEDepthwiseConvolutionLayer::prepare()
{
- switch(_impl->depth_conv_func)
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.prepare();
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 91e37594af..28d19d2950 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -26,19 +26,19 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuDequantize.h"
+
+#include "src/cpu/operators/CpuDequantize.h"
namespace arm_compute
{
struct NEDequantizationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuDequantize> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuDequantize> op{nullptr};
};
-NEDequantizationLayer::NEDequantizationLayer()
- : _impl(std::make_unique<Impl>())
+NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique<Impl>())
{
}
NEDequantizationLayer::~NEDequantizationLayer() = default;
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
index 9e63800728..b347390162 100644
--- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "src/common/utils/Log.h"
+
#include <cstddef>
#include <ios>
#include <list>
@@ -34,23 +36,36 @@
namespace arm_compute
{
NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+ : _memory_group(std::move(memory_manager)),
+ _dequantize(),
+ _detection_post_process(),
+ _decoded_scores(),
+ _run_dequantize(false)
{
}
-void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
+ const ITensor *input_scores,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
- ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
- output_scores->info(),
- num_detection->info(), info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+ output_scores);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(
+ input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+ output_classes->info(), output_scores->info(), num_detection->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
+ num_detection, info);
const ITensor *input_scores_to_use = input_scores;
DetectionPostProcessLayerInfo info_to_use = info;
_run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type());
- if(_run_dequantize)
+ if (_run_dequantize)
{
_memory_group.manage(&_decoded_scores);
@@ -59,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c
input_scores_to_use = &_decoded_scores;
// Create a new info struct to avoid dequantizing in the CPP layer
- std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
- DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
- scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+ std::array<float, 4> scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(),
+ info.scale_value_w()};
+ DetectionPostProcessLayerInfo info_quantized(
+ info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(),
+ info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false);
info_to_use = info_quantized;
}
- _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+ _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes,
+ output_classes, output_scores, num_detection, info_to_use);
_decoded_scores.allocator()->allocate();
}
-Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_scores,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
+ DetectionPostProcessLayerInfo info)
{
bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
- if(run_dequantize)
+ if (run_dequantize)
{
TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors,
+ output_boxes, output_classes, output_scores,
+ num_detection, info));
return Status{};
}
@@ -88,7 +114,7 @@ void NEDetectionPostProcessLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Decode scores if necessary
- if(_run_dequantize)
+ if (_run_dequantize)
{
_dequantize.run();
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 58530e4a8f..f1c2cf969f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,17 +27,18 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
+
+#include "src/cpu/operators/CpuDirectConv2d.h"
namespace arm_compute
{
struct NEDirectConvolutionLayer::Impl
{
- ITensor *src{ nullptr };
- const ITensor *weights{ nullptr };
- const ITensor *bias{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuDirectConv2d> op{ nullptr };
+ ITensor *src{nullptr};
+ const ITensor *weights{nullptr};
+ const ITensor *bias{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuDirectConv2d> op{nullptr};
};
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManage
}
NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void NEDirectConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
_impl->src = input;
_impl->weights = weights;
_impl->bias = bias;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
- _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
+ _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(),
+ conv_info, act_info);
}
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
index 946bbb24b8..685ef2d4d7 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuElementwise.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuElementwise.h"
#include <utility>
@@ -33,17 +34,16 @@ namespace arm_compute
{
struct NEElementwiseMax::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseMax> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseMax> op{nullptr};
};
-NEElementwiseMax::NEElementwiseMax()
- : _impl(std::make_unique<Impl>())
+NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
NEElementwiseMax::~NEElementwiseMax() = default;
@@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMax::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseMax::validate(input1, input2, output);
@@ -74,17 +77,16 @@ void NEElementwiseMax::run()
struct NEElementwiseMin::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseMin> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseMin> op{nullptr};
};
-NEElementwiseMin::NEElementwiseMin()
- : _impl(std::make_unique<Impl>())
+NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
NEElementwiseMin::~NEElementwiseMin() = default;
@@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMin::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseMin::validate(input1, input2, output);
@@ -115,21 +120,23 @@ void NEElementwiseMin::run()
struct NEElementwiseSquaredDiff::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{nullptr};
};
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff()
- : _impl(std::make_unique<Impl>())
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default;
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseSquaredDiff::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output);
@@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run()
struct NEElementwiseDivision::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseDivision> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseDivision> op{nullptr};
};
-NEElementwiseDivision::NEElementwiseDivision()
- : _impl(std::make_unique<Impl>())
+NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
NEElementwiseDivision::~NEElementwiseDivision() = default;
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseDivision::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseDivision::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseDivision::validate(input1, input2, output);
@@ -197,21 +212,23 @@ void NEElementwiseDivision::run()
struct NEElementwisePower::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwisePower> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwisePower> op{nullptr};
};
-NEElementwisePower::NEElementwisePower()
- : _impl(std::make_unique<Impl>())
+NEElementwisePower::NEElementwisePower() : _impl(std::make_unique<Impl>())
{
}
-NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
NEElementwisePower::~NEElementwisePower() = default;
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwisePower::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwisePower::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwisePower::validate(input1, input2, output);
@@ -239,22 +259,22 @@ void NEElementwisePower::run()
template <ComparisonOperation COP>
struct NEElementwiseComparisonStatic<COP>::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{nullptr};
};
template <ComparisonOperation COP>
-NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic()
- : _impl(std::make_unique<Impl>())
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() : _impl(std::make_unique<Impl>())
{
}
template <ComparisonOperation COP>
NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation COP>
-NEElementwiseComparisonStatic<COP> &NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation COP>
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP> &
+NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
template <ComparisonOperation COP>
@@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *inp
}
template <ComparisonOperation COP>
-Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
{
return cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output);
}
template <ComparisonOperation COP>
-void NEElementwiseComparisonStatic<COP>::run()
+void NEElementwiseComparisonStatic<COP>::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
@@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic<COP>::run()
struct NEElementwiseComparison::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseComparison> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseComparison> op{nullptr};
};
-NEElementwiseComparison::NEElementwiseComparison()
- : _impl(std::make_unique<Impl>())
+NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
NEElementwiseComparison::~NEElementwiseComparison() = default;
@@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso
_impl->op->configure(input1->info(), input2->info(), output->info(), op);
}
-Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+Status NEElementwiseComparison::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation op)
{
return cpu::CpuElementwiseComparison::validate(input1, input2, output, op);
}
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index 1a9e8839ca..23a092c407 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -22,7 +22,9 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
-#include "src/runtime/cpu/operators/CpuElementwiseUnary.h"
+
+#include "src/cpu/operators/CpuElementwiseUnary.h"
+
#include <utility>
namespace arm_compute
@@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary;
template <ElementWiseUnary op>
struct NEElementwiseUnaryLayer<op>::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<OperatorType> cpu_op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<OperatorType> cpu_op{nullptr};
};
template <ElementWiseUnary op>
-NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer()
- : _impl(std::make_unique<Impl>())
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() : _impl(std::make_unique<Impl>())
{
}
template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op>::~NEElementwiseUnaryLayer() = default;
template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default;
-template <ElementWiseUnary op>
+template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op> &NEElementwiseUnaryLayer<op>::operator=(NEElementwiseUnaryLayer &&) = default;
template <ElementWiseUnary op>
@@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer<op>::validate(const ITensorInfo *input, const ITe
}
template <ElementWiseUnary op>
-void NEElementwiseUnaryLayer<op>::run()
+void NEElementwiseUnaryLayer<op>::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, _impl->src);
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index e72488f0f6..fb75f9da29 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
@@ -36,7 +38,15 @@ namespace arm_compute
NEFFT1D::~NEFFT1D() = default;
NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+ : _memory_group(std::move(memory_manager)),
+ _digit_reverse_kernel(),
+ _fft_kernels(),
+ _scale_kernel(),
+ _digit_reversed_input(),
+ _digit_reverse_indices(),
+ _num_ffts(0),
+ _axis(0),
+ _run_scale(false)
{
}
@@ -44,6 +54,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+ ARM_COMPUTE_LOG_PARAMS(input, output, config);
// Decompose size to radix factors
const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
@@ -72,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
_fft_kernels.resize(_num_ffts);
_axis = config.axis;
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -82,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
fft_kernel_info.Nx = Nx;
fft_kernel_info.is_first_stage = (i == 0);
_fft_kernels[i] = std::make_unique<NEFFTRadixStageKernel>();
- _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+ _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr,
+ fft_kernel_info);
Nx *= radix_for_stage;
}
// Configure scale kernel
- if(_run_scale)
+ if (_run_scale)
{
FFTScaleKernelInfo scale_config;
scale_config.scale = static_cast<float>(N);
scale_config.conjugate = config.direction == FFTDirection::Inverse;
_scale_kernel = std::make_unique<NEFFTScaleKernel>();
- is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+ is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config)
+ : _scale_kernel->configure(output, nullptr, scale_config);
}
// Allocate tensors
@@ -111,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
// Check if FFT is decomposable
const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
@@ -120,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
// All combinations are supported except real input with real output (i.e., both input channels set to 1)
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
@@ -138,13 +151,13 @@ void NEFFT1D::run()
NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ));
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX));
}
// Run output scaling
- if(_run_scale)
+ if (_run_scale)
{
NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
index 3b787cd523..066909221d 100644
--- a/src/runtime/NEON/functions/NEFFT2D.cpp
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,16 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Scheduler.h"
-#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+
+#include "src/common/utils/Log.h"
namespace arm_compute
{
NEFFT2D::~NEFFT2D() = default;
NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+ : _memory_group(memory_manager),
+ _first_pass_func(memory_manager),
+ _second_pass_func(memory_manager),
+ _first_pass_tensor()
{
}
@@ -43,6 +45,7 @@ void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo &
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config));
+ ARM_COMPUTE_LOG_PARAMS(input, output, config);
// Setup first pass
FFT1DInfo first_pass_config;
@@ -79,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index 56fc2e4a2b..94f85e5ffa 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -25,14 +25,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/utils/helpers/fft.h"
namespace arm_compute
@@ -45,11 +47,11 @@ int pad_decomposable(int N)
int pad = 0;
bool is_decomposed = false;
- while(!is_decomposed)
+ while (!is_decomposed)
{
const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
is_decomposed = !decomposed_vector.empty();
- if(!is_decomposed)
+ if (!is_decomposed)
{
++pad;
}
@@ -101,10 +103,16 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
}
NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default;
-void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void NEFFTConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_original_weights = weights;
_original_bias = biases;
@@ -113,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_has_bias = biases != nullptr;
// Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
- const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
- pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ const Size2D input_dims =
+ Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size =
+ Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
// Tensors to use
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = _has_bias ? &_bias_output : output;
// Permute bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
_permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
_permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -135,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Permute input if needed
_needs_permute = input->info()->data_layout() == DataLayout::NHWC;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -156,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
// Pad weights
- const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
_pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
// Transform weights
@@ -164,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
// Pad input
- const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
_memory_group.manage(&_padded_input);
_pad_input_func.configure(input_to_use, &_padded_input, padding_in);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
}
@@ -191,7 +202,8 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_memory_group.manage(&_itransformed_output);
FFT2DInfo itranform_info;
itranform_info.direction = FFTDirection::Inverse;
- _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransformed_output.allocator()->init(
+ _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
_itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info);
_output_reduced.allocator()->allocate();
@@ -203,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Extract correct region
const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
- const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
- const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
- if(_has_bias)
+ const int end_right =
+ _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+ const int end_botton =
+ _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if (_has_bias)
{
_memory_group.manage(&_bias_output);
}
- else if(_needs_permute)
+ else if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
}
- _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top),
+ Coordinates(end_right, end_botton));
_reshaped_output.allocator()->allocate();
_itransformed_output.allocator()->allocate();
// Add bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
@@ -233,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
}
// Permute output
- if(_needs_permute)
+ if (_needs_permute)
{
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -245,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.configure(output, nullptr, act_info);
}
@@ -258,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
axis_data[1] = 1;
}
-Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEFFTConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
@@ -277,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
const auto strides = conv_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+ conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+ conv_info.pad_bottom() != (kernel_size.y() / 2));
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
@@ -289,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+ (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
// Validate Activation Layer
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
}
@@ -311,7 +334,7 @@ void NEFFTConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Transform input
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_func.run();
}
@@ -329,17 +352,17 @@ void NEFFTConvolutionLayer::run()
_extract_output_func.run();
// Add bias
- if(_has_bias)
+ if (_has_bias)
{
_bias_add_func.run();
}
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_func.run();
}
// Run activation layer
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.run();
}
@@ -347,10 +370,10 @@ void NEFFTConvolutionLayer::run()
void NEFFTConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Permute bias to NCHW
- if(_original_bias != nullptr)
+ if (_original_bias != nullptr)
{
_permuted_bias.allocator()->allocate();
_permute_bias_func.run();
@@ -360,7 +383,7 @@ void NEFFTConvolutionLayer::prepare()
const ITensor *cur_weights = _original_weights;
// Permute weights
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index ee539fdfc8..bc1d5b7f5c 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEFill.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuFill.h"
+
+#include "src/cpu/operators/CpuFill.h"
#include <utility>
@@ -32,15 +33,14 @@ namespace arm_compute
{
struct NEFill::Impl
{
- ITensor *tensor{ nullptr };
- std::unique_ptr<cpu::CpuFill> op{ nullptr };
+ ITensor *tensor{nullptr};
+ std::unique_ptr<cpu::CpuFill> op{nullptr};
};
-NEFill::NEFill()
- : _impl(std::make_unique<Impl>())
+NEFill::NEFill() : _impl(std::make_unique<Impl>())
{
}
-NEFill::NEFill(NEFill &&) = default;
+NEFill::NEFill(NEFill &&) = default;
NEFill &NEFill::operator=(NEFill &&) = default;
NEFill::~NEFill() = default;
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index 256aad6d3f..a3ab9c3db4 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -25,17 +25,22 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
{
-NEFillBorder::NEFillBorder()
- : _border_handler(nullptr)
+NEFillBorder::NEFillBorder() : _border_handler(nullptr)
{
}
-void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorder::configure(ITensor *input,
+ unsigned int border_width,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
+ ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value);
_border_handler = std::make_unique<NEFillBorderKernel>();
_border_handler->configure(input, BorderSize(border_width), border_mode, constant_border_value);
}
@@ -44,4 +49,4 @@ void NEFillBorder::run()
{
NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 4d1054ad25..56db2be3fa 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -24,25 +24,25 @@
#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/cpu/operators/CpuFlatten.h"
+#include "src/cpu/operators/CpuFlatten.h"
namespace arm_compute
{
struct NEFlattenLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuFlatten> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuFlatten> op{nullptr};
};
-NEFlattenLayer::NEFlattenLayer()
- : _impl(std::make_unique<Impl>())
+NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique<Impl>())
{
}
-NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default;
+NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default;
NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default;
NEFlattenLayer::~NEFlattenLayer() = default;
@@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_impl->src = input;
_impl->dst = output;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_flatten_shape(input->info())));
_impl->op = std::make_unique<cpu::CpuFlatten>();
_impl->op->configure(_impl->src->info(), _impl->dst->info());
@@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
}
return cpu::CpuFlatten::validate(input, output);
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index f8a3c13d6d..112c93c478 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -24,22 +24,22 @@
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuFloor.h"
+
+#include "src/cpu/operators/CpuFloor.h"
namespace arm_compute
{
struct NEFloor::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuFloor> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuFloor> op{nullptr};
};
-NEFloor::NEFloor()
- : _impl(std::make_unique<Impl>())
+NEFloor::NEFloor() : _impl(std::make_unique<Impl>())
{
}
-NEFloor::NEFloor(NEFloor &&) = default;
+NEFloor::NEFloor(NEFloor &&) = default;
NEFloor &NEFloor::operator=(NEFloor &&) = default;
NEFloor::~NEFloor() = default;
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index f469a0bdab..2656d0fa0f 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,469 +23,138 @@
*/
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/cpu/kernels/CpuTransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include <cmath>
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuFullyConnected.h"
namespace arm_compute
{
-using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
-namespace
+struct NEFullyConnectedLayer::Impl
{
-// Get min, max bound of a quantized assymetric output tensor, with the effect of fused activation
-std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
-{
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- const UniformQuantizationInfo q_unif = q_info.uniform();
-
- if(act_info.enabled())
- {
- switch(act_info.activation())
- {
- case ActivationLayerInfo::ActivationFunction::RELU:
- type_min = PixelValue(q_unif.offset);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- type_min = PixelValue(q_unif.offset);
- type_max = PixelValue(act_info.a(), data_type, q_info);
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- type_min = PixelValue(act_info.b(), data_type, q_info);
- type_max = PixelValue(act_info.a(), data_type, q_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Activation function not supported.");
- break;
- }
- }
-
- return std::make_pair(type_min, type_max);
-}
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
-Status get_gemmlowp_output_stage_info(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
-{
- const auto data_type = input->data_type();
- const QuantizationInfo oq_info = output->quantization_info();
- const UniformQuantizationInfo iq_unif = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_unif = oq_info.uniform();
-
- float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
- int32_t output_multiplier;
- int32_t output_shift;
-
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
+ std::unique_ptr<cpu::CpuFullyConnected> op{nullptr};
- gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
- gemmlowp_output_stage_info.gemmlowp_shift = output_shift;
- gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset;
- gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get<int32_t>();
- gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get<int32_t>();
+ const ITensor *original_weights{nullptr};
- return Status{};
-}
-
-Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act)
-{
- if(is_data_type_quantized_asymmetric(input->data_type()))
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info(input->quantization_info().uniform().scale, -input->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
- GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(input, weights, output, act, gemmlowp_output_stage_info));
-
- GEMMInfo gemm_info;
- gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
-
- // Validate gemmlowp function
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_quantization_info(input_quantization_info),
- &weights->clone()->set_quantization_info(weights_quantization_info),
- biases,
- output,
- gemm_info));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(input, weights, biases, output, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
- }
-
- return Status{};
-}
-} // namespace
+ bool is_prepared{false};
+ bool dynamic_weights{false};
+};
NEFullyConnectedLayer::~NEFullyConnectedLayer() = default;
-NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(),
- _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(nullptr, weights_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(),
- _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), _is_quantized_asymmetric(false), _is_prepared(false)
-{
-}
-
-void NEFullyConnectedLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act)
-{
- if(_is_quantized_asymmetric)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = input->info()->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
-
- input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
- // Configure gemmlowp function and output stage for asymmetric quantized types
- GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- const Status status = get_gemmlowp_output_stage_info(input->info(), weights->info(), output->info(), act, gemmlowp_output_stage_info);
- ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
-
- GEMMInfo gemm_info;
- gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
- gemm_info.set_activation_info(act);
- _mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
-
- // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers
- input->info()->set_quantization_info(input_quantization_info);
- weights->info()->set_quantization_info(weights_quantization_info);
- }
- else
- {
- // Configure matrix multiply kernel
- GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */);
- gemm_info.set_activation_info(act);
- _mm_gemm.configure(input, weights, biases, output, 1.f, 1.0f, gemm_info);
- }
-}
-
-void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for flatten
- TensorShape shape_flatten = compute_flatten_shape(input->info());
- _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
-
- // Configure flatten kernel
- _memory_group.manage(&_flatten_output);
-
- _flatten.configure(input, &_flatten_output);
-
- // Configure matrix multiply kernel
- configure_mm(&_flatten_output, weights, biases, output, act);
-
- // Allocate the output tensor for flatten once all the configure methods have been called
- _flatten_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
- // Configure matrix multiply kernel
- configure_mm(input, weights, biases, output, act);
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+ _impl->weights_manager = weights_manager;
}
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
- FullyConnectedLayerInfo fc_info)
+void NEFullyConnectedLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(),
- weights->info(),
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(),
biases != nullptr ? biases->info() : nullptr,
- output->info(),
- fc_info));
+ output->info(), fc_info, weights_info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info);
- _are_weights_converted = true;
- _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- _is_fc_after_conv = true;
- _is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
- _original_weights = weights;
+ _impl->op = std::make_unique<cpu::CpuFullyConnected>();
+ _impl->original_weights = weights;
+ _impl->is_prepared = false;
- if(_weights_manager)
- {
- _weights_manager->manage(weights);
- }
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensor *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
- if(is_batched_fc_layer)
- {
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
- }
- else
- {
- _is_fc_after_conv = input->info()->num_dimensions() > 1;
- }
-
- // Reshape weights if needed
- if(!_are_weights_reshaped)
- {
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed_function.configure(weights);
- weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed_function);
- }
- else
- {
- // Reshape the weights
- _reshape_weights_function.configure(weights, &_reshape_weights_output);
- weights_to_use = &_reshape_weights_output;
- }
- }
+ _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
+ fc_info, weights_info);
- // Convert weights if needed
- if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+ if (_impl->weights_manager != nullptr)
{
- if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
- {
- _convert_weights_managed.configure(weights_to_use,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
- weights_to_use = _weights_manager->acquire(weights, &_convert_weights_managed);
- }
- else
- {
- // Convert weights
- _convert_weights.configure(weights_to_use,
- &_converted_weights_output,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
-
- weights_to_use = &_converted_weights_output;
- }
- _are_weights_converted = false;
+ _impl->weights_manager->manage(_impl->original_weights);
}
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(input, weights_to_use, biases, output, fc_info.activation_info);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(input, weights_to_use, biases, output, fc_info.activation_info);
- }
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
- _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+ _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
}
-Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- FullyConnectedLayerInfo fc_info)
+Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const FullyConnectedLayerInfo &fc_info,
+ const WeightsInfo &weights_info)
{
- ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
-
- bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- bool is_fc_after_conv = true;
-
- const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensorInfo *input_to_use = input;
- const ITensorInfo *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = output->dimension(1) > 1;
-
- if(is_batched_fc_layer)
- {
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
- input->tensor_shape().cend(),
- output->tensor_shape().cbegin() + 1));
- }
- else
- {
- is_fc_after_conv = input->num_dimensions() > 1;
- }
-
- if(!weights_reshaped)
- {
- // Validate reshape weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(weights, &reshaped_weights));
- weights_to_use = &reshaped_weights;
- }
-
- if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
- {
- // Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- input->tensor_shape(),
- fc_info.weights_trained_layout));
- weights_to_use = &converted_weights;
- }
-
- if(is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
-
- // Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input));
- input_to_use = &flatten_input;
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
- }
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(input_to_use, weights_to_use, biases, output, fc_info.activation_info));
+ return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info,
+ weights_info);
+}
- return Status{};
+Status NEFullyConnectedLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
+{
+ return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info);
}
void NEFullyConnectedLayer::run()
{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Linearize input if it comes from a convolutional layer
- if(_is_fc_after_conv)
+ if (!_impl->dynamic_weights)
{
- _flatten.run();
+ prepare();
}
- // Run matrix multiply
- if(_is_quantized_asymmetric)
- {
- _mm_gemmlowp.run();
- }
- else
- {
- _mm_gemm.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void NEFullyConnectedLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- if(!_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- }
-
- auto release_unused = [](Tensor * w)
- {
- if(!w->is_used())
- {
- w->allocator()->free();
- }
- };
+ _impl->op->prepare(_impl->run_pack);
- // Pointer to current weights
- const ITensor *cur_weights = _original_weights;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
+ _impl->is_prepared = true;
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
+ // Handle weights managed infrastructure
+ if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
{
- if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare
+ // This is for cases where multiple functions share the same b (weights)
+ // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference
+ const ITensor *original_b = _impl->original_weights;
+ if (!original_b->is_used())
{
- cur_weights = _weights_manager->run(cur_weights, &_reshape_weights_managed_function);
+ _impl->weights_manager->pre_mark_as_unused(original_b);
}
- else
- {
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- // Run reshape weights kernel and mark weights as unused
- _reshape_weights_output.allocator()->allocate();
- _reshape_weights_function.run();
- }
- cur_weights->mark_as_unused();
- cur_weights = &_reshape_weights_output;
- }
- _are_weights_reshaped = true;
+ _impl->original_weights->mark_as_used();
+ _impl->weights_manager->release(_impl->original_weights);
}
-
- // Convert weights if needed (happens only once)
- if(!_are_weights_converted)
- {
- if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
- {
- _weights_manager->run(cur_weights, &_convert_weights_managed);
- }
- else
- {
- _converted_weights_output.allocator()->allocate();
- _convert_weights.run();
- cur_weights->mark_as_unused();
- }
-
- _are_weights_converted = true;
- }
-
- // Release reshaped weights if unused
- release_unused(&_reshape_weights_output);
-
- // Prepare GEMM prepare and release unused weights
- if(!_is_quantized_asymmetric)
- {
- _mm_gemm.prepare();
- }
-
- // Release converted weights if unused
- release_unused(&_reshape_weights_output);
- release_unused(&_converted_weights_output);
-
- _is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
index a8ce6b2bfc..f5b8b57dac 100644
--- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,32 +28,50 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
namespace arm_compute
{
NEFuseBatchNormalization::~NEFuseBatchNormalization() = default;
-NEFuseBatchNormalization::NEFuseBatchNormalization()
- : _fuse_bn_kernel()
+NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel()
{
}
-void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
- ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalization::configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
+ ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
+
_fuse_bn_kernel = std::make_unique<NEFuseBatchNormalizationKernel>();
- _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
}
-Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
void NEFuseBatchNormalization::run()
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 7318c3e492..934a8250cc 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,382 +23,140 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
-#include <cmath>
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemm.h"
-using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
namespace arm_compute
{
-namespace
+struct NEGEMM::Impl
{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
-{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
- return asm_info;
-}
-} // namespace
+ std::unique_ptr<cpu::CpuGemm> op{nullptr};
+
+ const ITensor *original_b{nullptr};
+ bool is_prepared{false};
+
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
+};
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>()), _ma_kernel(),
- _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
- _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+ _impl->weights_manager = weights_manager;
}
NEGEMM::~NEGEMM() = default;
-void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void NEGEMM::configure(const ITensor *a,
+ const ITensor *b,
+ const ITensor *c,
+ ITensor *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
-
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
- bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr,
+ d->info(), alpha, beta, gemm_info));
// Check if we need to reshape the matrix B only on the first run
- _is_prepared = false;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _original_b = b;
- _run_alpha_scale = alpha != 1.f;
- _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run();
- _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
- _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+ _impl->is_prepared = false;
+ _impl->original_b = b;
+ _impl->op = std::make_unique<cpu::CpuGemm>();
- if(run_optimised)
+ // Make the B matrix dynamic values.
+ auto b_info_to_use = b->info()->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
{
- const ITensor *c_to_use = is_c_bias ? c : nullptr;
- const ITensorInfo *c_info_to_use = c_to_use != nullptr ? c_to_use->info() : nullptr;
- _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info);
- ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
-
- _asm_glue_tensors =
- {
- { ACL_SRC_0, a },
- { ACL_SRC_1, b },
- { ACL_SRC_2, c_to_use },
- { ACL_DST, d },
- };
-
- // Scale product by alpha
- if(_run_alpha_scale)
- {
- _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
- }
+ b_info_to_use->set_are_values_constant(false);
}
- else
- {
- // Pick output tensor in case bias addition should be performed
- ITensor *gemm_output_to_use = d;
- if(_run_bias_addition)
- {
- gemm_output_to_use = &_tmp_d;
- _memory_group.manage(&_tmp_d);
- }
-
- _mm_kernel = std::make_unique<NEGEMMMatrixMultiplyKernel>();
-
- // Select between GEMV and GEMM
- if(_run_vector_matrix_multiplication)
- {
- // Configure the matrix multiply kernel
- _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
- }
- else
- {
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
-
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
- const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
- shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
-
- TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
- TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
-
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
- int m = a->info()->dimension(1);
- int n = b->info()->dimension(0);
- int k = a->info()->dimension(0);
+ _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta,
+ gemm_info);
- // Configure interleave kernel
- _interleave_kernel = std::make_unique<NEGEMMInterleave4x4Kernel>();
- _interleave_kernel->configure(a, &_tmp_a);
-
- // Configure transpose kernel
- _transpose_kernel = std::make_unique<NEGEMMTranspose1xWKernel>();
- _transpose_kernel->configure(b, &_tmp_b);
-
- // Configure matrix multiplication kernel
- _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
-
- // Allocate once the all configure methods have been called
- _tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- if(_run_bias_addition)
- {
- _add_bias.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
- _tmp_d.allocator()->allocate();
- }
- }
-
- // Configure matrix addition kernel
- if(_run_addition)
- {
- _ma_kernel = std::make_unique<NEGEMMMatrixAdditionKernel>();
- _ma_kernel->configure(c, d, beta);
- }
-
- // Configure activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(_run_activation)
- {
- _activation_func.configure(d, nullptr, activation);
- }
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}};
+ _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMM::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_UNUSED(alpha);
- const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- if(a->data_type() != DataType::BFLOAT16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output);
- }
-
- if(c != nullptr && !is_c_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
- {
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
-
- // Check if we need to run the optimized assembly kernel
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info));
-
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
-
- // Check if the first input tensor is a vector.
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- // Check if we need to reshape the matrix A and matrix B
- const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run());
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to NEGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to NEGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- const int m = a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
-
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo tmp_output_info = *output->clone();
-
- if(run_interleave_transpose)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
-
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
- }
-
- // Validate matrix multiply
- auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
-
- if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
- }
- }
-
- // Validate matrix addition kernel
- if(beta != 0 && c != nullptr && !is_c_bias)
+ // Make the B matrix dynamic values.
+ auto b_to_use = b->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta));
+ b_to_use->set_are_values_constant(false);
}
- // Validate activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
- }
+ return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info);
+}
- return Status{};
+Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha, beta);
+ return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info);
}
void NEGEMM::run()
{
prepare();
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_asm_glue->is_configured())
- {
- _asm_glue->run(_asm_glue_tensors);
- if(_run_alpha_scale)
- {
- _alpha_scale_func.run();
- }
- }
- else
- {
- if(!_run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- NEScheduler::get().schedule(_interleave_kernel.get(), Window::DimY);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY);
- }
- }
-
- NEScheduler::get().schedule(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
-
- // Run bias addition kernel
- if(_run_bias_addition)
- {
- _add_bias.run();
- }
- }
-
- // Run matrix addition kernel
- if(_run_addition)
- {
- NEScheduler::get().schedule(_ma_kernel.get(), Window::DimY);
- }
-
- // Run activation function
- if(_run_activation)
- {
- _activation_func.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void NEGEMM::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b);
- if(_asm_glue->is_configured())
- {
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
+ _impl->op->prepare(_impl->prep_pack);
+
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- _asm_glue->prepare(_asm_glue_tensors);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
+ if (has_reshape != std::end(_impl->aux_mem_req))
+ {
+ _impl->original_b->mark_as_unused();
}
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ else
{
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
-
- _tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
+ _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b);
}
- _is_prepared = true;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 564ce2f514..6cca02eea9 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -24,50 +24,93 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
+#include "arm_compute/runtime/Tensor.h"
-#include <set>
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
namespace arm_compute
{
using OperatorType = cpu::CpuGemmDirectConv2d;
+using namespace arm_compute::experimental;
struct NEGEMMConv2d::Impl
{
- ITensorPack tensors{};
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ITensor *weights{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ MemoryGroup memory_group{};
+ bool is_prepared{false};
+ experimental::MemoryRequirements aux_mem_req{};
};
-NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _impl(std::make_unique<Impl>())
+NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) : _impl(std::make_unique<Impl>())
{
- _impl->op = std::make_unique<OperatorType>(memory_manager);
+ _impl->memory_group = MemoryGroup(memory_manager);
}
NEGEMMConv2d::~NEGEMMConv2d() = default;
-void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
+void NEGEMMConv2d::configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
{
- _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, input);
- _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights);
- _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases);
- _impl->tensors.add_tensor(TensorType::ACL_DST, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info);
+ _impl->weights = weights;
+ _impl->is_prepared = false;
+ _impl->op = std::make_unique<OperatorType>();
+
+ _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ info);
+
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info)
+Status NEGEMMConv2d::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &info)
{
return OperatorType::validate(input, weights, biases, output, info);
}
+
void NEGEMMConv2d::run()
{
- _impl->op->run(_impl->tensors);
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
+
void NEGEMMConv2d::prepare()
{
- _impl->op->prepare(_impl->tensors);
+ if (!_impl->is_prepared)
+ {
+ _impl->op->prepare(_impl->prep_pack);
+
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+
+ if (has_reshape != std::end(_impl->aux_mem_req))
+ {
+ _impl->weights->mark_as_unused();
+ }
+ else
+ {
+ _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights);
+ }
+
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
+ _impl->is_prepared = true;
+ }
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 2876c254fa..c8f65d2fd9 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,617 +26,109 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
-#include <set>
-#include <tuple>
+using namespace arm_compute::experimental;
namespace arm_compute
{
-using namespace arm_compute::misc::shape_calculator;
-
-NEConvolutionLayerReshapeWeights::~NEConvolutionLayerReshapeWeights() = default;
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() noexcept
- : _weights_reshape_kernel()
-{
-}
-
-void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output)
-{
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info()));
- const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
-
- _weights_reshape_kernel = std::make_unique<NEWeightsReshapeKernel>();
- _weights_reshape_kernel->configure(weights, biases_to_use, output);
-
- output->info()->set_quantization_info(weights->info()->quantization_info());
-}
-
-Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
+struct NEGEMMConvolutionLayer::Impl
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1,
- DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-
- NEWeightsReshapeKernel::validate(weights, biases, output);
- }
-
- return Status{};
-}
-
-void NEConvolutionLayerReshapeWeights::run()
+ const ITensor *weights{nullptr};
+ std::unique_ptr<cpu::CpuGemmConv2d> op{nullptr};
+ ITensorPack run_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ bool is_prepared{false};
+};
+
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3);
+ _impl->weights_manager = weights_manager;
+ _impl->memory_group = MemoryGroup(memory_manager);
}
-
NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
-NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
- _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _original_output(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), _tmp_output(),
- _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false)
+void NEGEMMConvolutionLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
-}
-
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(),
- act_info, gemm_3d_depth, _skip_im2col));
-
- // Create GEMMInfo structure
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- // Supported activations in GEMM
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(_is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo iqinfo = input->info()->quantization_info();
- const QuantizationInfo wqinfo = weights->info()->quantization_info();
- const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
- const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = input->info()->data_type();
-
- input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
- {
- const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
- weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
- }
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
-
- _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
-
- // Revert back QuantizatioInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(iqinfo);
- weights->info()->set_quantization_info(wqinfo);
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
- const DataType data_type = input->data_type();
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_activation_enabled = act_info.enabled();
-
- // Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- if(is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo &iqinfo = input->quantization_info();
- const QuantizationInfo &wqinfo = weights->quantization_info();
- const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
-
- // Perform validation step on GEMMLowp
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
- }
- else
- {
- // Perform validation step on Matrix multiply function
- return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
- const DataType data_type = input_info->data_type();
- const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
- const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
-
- // Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
- const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
+ _impl->weights = weights;
+ _impl->op = std::make_unique<cpu::CpuGemmConv2d>();
+ _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(),
+ conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input},
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
+ {TensorType::ACL_DST, output}};
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(num_groups, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- conv_info,
- weights_info,
- dilation,
- act_info,
- num_groups));
-
- const DataType data_type = input->info()->data_type();
- const DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->info()->dimension(idx_width);
- const unsigned int kernel_height = weights->info()->dimension(idx_height);
-
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _original_output = output;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- const ITensor *gemm_input_to_use = input;
- ITensor *gemm_output_to_use = output;
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
- input->info()->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- // Check if GEMM3D is supported
- if(data_layout == DataLayout::NHWC)
- {
- _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!_skip_col2im)
- {
- _skip_im2col = false;
- }
- }
- else
- {
- _skip_col2im = false;
- }
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
-
- // _weights_reshaped will be auto configured in the kernel.
- // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
- const ITensor *weights_to_use = weights;
-
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed.configure(weights, nullptr);
- weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed);
- }
- else
- {
- _reshape_weights.configure(weights, nullptr, &_weights_reshaped);
- weights_to_use = &_weights_reshaped;
- }
-
- // Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
- {
- _memory_group.manage(&_im2col_output);
-
- // Configure
- _im2col_kernel = std::make_unique<NEIm2ColKernel>();
- _im2col_kernel->configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
-
- // Update GEMM input
- gemm_input_to_use = &_im2col_output;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!_skip_col2im)
- {
- TensorShape shape_gemm;
-
- // Calculate GEMM output shape
- shape_gemm = _im2col_output.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo info_gemm(shape_gemm, 1, output_data_type);
- info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
- _gemm_output.allocator()->init(info_gemm);
- _gemm_output_3d.allocator()->init(info_gemm);
- _memory_group.manage(&_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output;
- }
- else
- {
- TensorInfo out_info{ *output->info() };
- out_info.set_data_type(output_data_type).set_data_layout(input->info()->data_layout()).set_is_resizable(true);
- _gemm_output.allocator()->init(out_info);
- _gemm_output_3d.allocator()->init(out_info);
- _memory_group.manage(&_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output_3d;
- }
-
- // Configure GEMM
- // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
- configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth);
-
- if(!_skip_im2col)
- {
- _im2col_output.allocator()->allocate();
- }
-
- if(!_skip_col2im)
- {
- if(_data_layout == DataLayout::NCHW)
- {
- // Configure col2im
- _col2im_kernel = std::make_unique<NECol2ImKernel>();
- _col2im_kernel->configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
- }
- else
- {
- // Configure reshape layer
- _reshape_layer.configure(gemm_output_to_use, output);
- }
- }
- else
- {
- // Configure reshape layer
- _reshape_layer.configure(gemm_output_to_use, output);
- }
-
- if(_is_quantized && !_skip_col2im)
- {
- _tmp_output.allocator()->allocate();
- }
-
- _gemm_output.allocator()->allocate();
-
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
- "Output shape does not match the expected one");
+ return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
}
-Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ const bool enable_fast_math)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
-
- const DataLayout data_layout = input->data_layout();
- const DataType data_type = input->data_type();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
-
- TensorInfo im2col_reshaped_info{};
- TensorInfo info_gemm{};
- TensorInfo tmp_info{};
- TensorInfo weights_reshaped_info{};
- const ITensorInfo *gemm_input_to_use = input;
- const ITensorInfo *gemm_output_to_use = output;
- const ITensorInfo *weights_to_use = weights;
-
- const bool append_bias = false;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_bf16 = data_type == DataType::BFLOAT16;
- bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
- input->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- // Check if GEMM3D is supported
- bool skip_col2im = false;
- if(data_layout == DataLayout::NHWC)
- {
- skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!skip_col2im)
- {
- skip_im2col = false;
- }
- }
-
- if(skip_col2im)
- {
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col)))
- {
- skip_im2col = false;
- skip_col2im = false;
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- // Validate biases
- if(biases != nullptr)
- {
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else if(is_bf16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
-
- // Output tensor auto inizialization if not yet initialized
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr));
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
- weights_reshaped_info.set_quantization_info(weights->quantization_info());
- weights_to_use = &weights_reshaped_info;
-
- if(!skip_im2col)
- {
- // Create tensor info for im2col reshaped inputs
- // For CPU, the batch size is on the fourth dimension
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
- shape_im2col.set(2, 1);
-
- im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(input->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
- gemm_input_to_use = &im2col_reshaped_info;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!skip_col2im)
- {
- TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
- info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
- }
- else
- {
- info_gemm = TensorInfo(output->tensor_shape(), 1, output_data_type);
- }
- info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
- gemm_output_to_use = &info_gemm;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
-
- // Validate Col2Im/ReshapeLayer
- if(!skip_col2im && (data_layout == DataLayout::NCHW))
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
- }
-
- return Status{};
+ return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info,
+ dilation, act_info, enable_fast_math);
}
void NEGEMMConvolutionLayer::run()
{
prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- bool out_has_padding = _skip_col2im && (_original_output->info()->padding().bottom != 0 || _original_output->info()->padding().top != 0);
-
- if(!_skip_im2col)
- {
- // Run input reshaping
- unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- NEScheduler::get().schedule(_im2col_kernel.get(), y_dim);
- }
-
- // Handle the case where output has top/bottom padding
- const ITensor *out_to_use = out_has_padding ? &_gemm_output : _original_output;
- _gemm_output_3d.info()->extend_padding(out_to_use->info()->padding());
- _gemm_output_3d.allocator()->import_memory(out_to_use->buffer());
-
- // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions
- if(_is_quantized)
- {
- // Run gemmlowp
- _mm_gemmlowp.run();
- }
- else
- {
- // Run gemm
- _mm_gemm.run();
- }
-
- // Reshape output matrix
- if(!_skip_col2im)
- {
- if(_data_layout == DataLayout::NCHW)
- {
- NEScheduler::get().schedule(_col2im_kernel.get(), Window::DimY);
- }
- else
- {
- _reshape_layer.run();
- }
- }
- else if(out_has_padding)
- {
- _reshape_layer.run();
- }
-
- _gemm_output_3d.allocator()->free();
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void NEGEMMConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
- {
- _weights_manager->run(_original_weights, &_reshape_weights_managed);
- }
- else
- {
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
- }
-
- // Prepare GEMM
- _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
- if(!_weights_reshaped.is_used())
- {
- _weights_reshaped.allocator()->free();
- }
+ _impl->op->prepare(_impl->run_pack);
- _is_prepared = true;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors);
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index cc0f20e695..44bfc6a51e 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,604 +23,109 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+
+using namespace arm_compute::experimental;
namespace arm_compute
{
-namespace
+struct NEGEMMLowpMatrixMultiplyCore::Impl
{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+ const ITensor *b{nullptr};
+ std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ bool is_prepared{false};
+};
+
+NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.output_stage = info.gemmlowp_output_stage();
-
- return asm_info;
+ _impl->weights_manager = weights_manager;
+ _impl->memory_group = MemoryGroup(memory_manager);
}
-} // namespace
-
-using namespace arm_compute::misc::shape_calculator;
-
NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
-NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(),
- _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(),
- _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
- _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false),
- _run_activation(false), _flip_signedness(false)
-{
-}
-
-void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
+void NEGEMMLowpMatrixMultiplyCore::configure(
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(c);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- const ITensor *matrix_a = a;
- const ITensor *matrix_b = b;
- GEMMInfo info = gemm_info;
-
- // Set internal variables
- _a_offset = a->info()->quantization_info().uniform().offset;
- _b_offset = b->info()->quantization_info().uniform().offset;
- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
- _is_prepared = false;
- _fused_assembly_path = false;
- _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
- _original_b = b;
-
- const ITensor *a_to_use = a;
-
- // Convert to QASYMM8 -> QASYMM8_SIGNED and back
- if(_flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();
-
- _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
- _memory_group.manage(&_signed_a);
- _convert_to_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>();
- _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
- a_to_use = &_signed_a;
- _a_offset = _signed_a.info()->quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
- _memory_group.manage(&_signed_output);
- _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a = &_signed_a;
- }
-
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- _fuse_output_stage = true;
- _memory_group.manage(&_mm_result_s32);
- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
- _mm_result_s32.allocator()->init(info_mm_result_s32);
- }
-
- // Initialize assembly kernel meta-data
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
-#ifdef __aarch64__
- switch(a->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::U8:
- case DataType::S8:
- {
- if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- auto c_info_to_use = c == nullptr ? nullptr : c->info();
- _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info);
- _fused_assembly_path = _asm_glue->is_configured();
- _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
- _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output);
- }
- else
- {
- auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output);
- _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info);
- _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
- }
- _assembly_path = _asm_glue->is_configured();
- _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
- _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Datatype not supported");
- break;
- }
- }
-#endif /* __aarch64__ */
- if(!(_assembly_path || _run_vector_matrix_multiplication))
- {
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
- _tmp_a.allocator()->init(a_info);
- _tmp_b.allocator()->init(b_info);
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
-
- // Configure interleave kernel
- _mtx_a_reshape_kernel = std::make_unique<NEGEMMInterleave4x4Kernel>();
- _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
-
- // Configure transpose kernel
- _mtx_b_reshape_kernel = std::make_unique<NEGEMMTranspose1xWKernel>();
- _mtx_b_reshape_kernel->configure(b, &_tmp_b);
- }
-
- if(!_fused_assembly_path)
- {
- // Build reduction info
- const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false);
-
- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
-
- _vector_sum_col.allocator()->init(info_vector_sum_col);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_vector_sum_col);
- }
-
- // Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel = std::make_unique<NEGEMMLowpMatrixBReductionKernel>();
- _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
- }
-
- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
-
- _vector_sum_row.allocator()->init(info_vector_sum_row);
- _memory_group.manage(&_vector_sum_row);
-
- // Configure matrix A reduction kernel
- _mtx_a_reduction_kernel = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
- }
-
- if(_fuse_output_stage)
- {
- // Configure matrix multiply kernel
- if(!_assembly_path)
- {
- _mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
- }
-
- _offset_contribution_output_stage_kernel = std::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
- _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : output,
- a->info()->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
-
- if(_flip_signedness)
- {
- _convert_from_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>();
- _convert_from_signed_asymm->configure(&_signed_output, output);
- }
- }
- else
- {
- // Configure matrix multiply kernel
- if(!_assembly_path)
- {
- _mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- _mm_kernel->configure(matrix_a, matrix_b, output);
- }
- // Configure offset contribution kernel
- _offset_contribution_kernel = std::make_unique<NEGEMMLowpOffsetContributionKernel>();
- _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
- }
- }
- // Configure activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
- if(_run_activation)
- {
- _activation_func.configure(output, nullptr, activation);
- }
-
- // Allocate tensors
- if(!_assembly_path && !_run_vector_matrix_multiplication)
- {
- _tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- if(!_fused_assembly_path)
- {
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- }
-
- if(_b_offset != 0)
- {
- _vector_sum_row.allocator()->allocate();
- }
- }
- if(_fuse_output_stage)
- {
- _mm_result_s32.allocator()->allocate();
- }
-
- if(_flip_signedness)
- {
- _signed_a.allocator()->allocate();
- _signed_output.allocator()->allocate();
- }
+ // Make the B matrix dynamic values.
+ auto b_info_to_use = b->info()->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
+ {
+ b_info_to_use->set_are_values_constant(false);
+ }
+
+ _impl->b = b;
+ _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+ _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(),
+ gemm_info);
+ _impl->run_pack = {{TensorType::ACL_SRC_0, a},
+ {TensorType::ACL_SRC_1, b},
+ {TensorType::ACL_SRC_2, c},
+ {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}};
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- GEMMInfo info = gemm_info;
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- const ITensorInfo *a_to_use = a;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo mm_result_s32_info{};
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if(fuse_output_stage)
+ // Make the B matrix dynamic values.
+ auto b_info_to_use = b->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
{
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ b_info_to_use->set_are_values_constant(false);
}
- // Convert QASYMM8->QASYMM8_SIGNED
- TensorInfo signed_a{};
- TensorInfo signed_output{};
- bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
- if(flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
-
- signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
- a_to_use = &signed_a;
- a_offset = signed_a.quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
- signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a_info = &signed_a;
- }
-
- // Initialize assembly kernel meta-data
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(info);
-
- // Check if we need to run the optimized assembly kernel
- bool run_optimised = false;
- bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
- run_optimised_requantized = run_optimised;
- }
- else
- {
- run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
- }
-
- if(run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(info.depth_output_gemm3d() != 0)
- {
- if(info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
-
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if(!run_vector_matrix_multiplication)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
- }
- }
-
- if(!run_optimised_requantized)
- {
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
-
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
- }
-
- if(fuse_output_stage)
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- flip_signedness ? &signed_output : output,
- a_offset, b_offset,
- info.gemmlowp_output_stage()));
- }
- else
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
- }
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
- }
- }
-
- // Validate activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
- }
-
- return Status{};
+ return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b_info_to_use.get(), c, output, gemm_info);
}
void NEGEMMLowpMatrixMultiplyCore::run()
{
prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Convert QASYMM8->QASYMM8_SIGNED
- if(_flip_signedness)
- {
- NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY);
- }
-
- // Run GEMM
- if(_asm_glue->is_configured())
- {
- _asm_glue->run(_asm_glue_tensors);
- }
- else
- {
- if(!_run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- }
- }
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
- }
-
- if(!_fused_assembly_path)
- {
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX);
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
- }
-
- if(_fuse_output_stage)
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY);
- }
- else
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY);
- }
- }
-
- // Convert QASYMM8_SIGNED->QASYMM8
- if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
- {
- NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY);
- }
-
- // Run fused activation unless already run in the fused assembly
- if(_run_activation)
- {
- _activation_func.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void NEGEMMLowpMatrixMultiplyCore::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b);
- // Run assembly reshape
- if(_asm_glue->is_configured())
- {
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
+ _impl->op->prepare(_impl->prep_pack);
- _asm_glue->prepare(_asm_glue_tensors);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
- }
- // Run non-assembly reshape
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
- {
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
-
- // Run reshape kernel and mark original weights tensor as unused
- _tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
- }
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
- _vector_sum_col.allocator()->allocate();
- NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
+ _impl->b->mark_as_unused();
}
- _is_prepared = true;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors);
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 807785a534..8178003b5e 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,162 +25,54 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-namespace arm_compute
-{
-NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default;
-
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
-{
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
- _kernel = std::move(k);
-}
-
-Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
-}
-
-NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default;
-
-void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
-{
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
- _kernel = std::move(k);
-}
+#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
-Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+namespace arm_compute
{
- return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
-}
-
-NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default;
-
-void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max)
+struct NEGEMMLowpOutputStage::Impl
{
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
- _kernel = std::move(k);
-}
-
-Status NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+ const ITensor *src{nullptr};
+ const ITensor *bias{nullptr};
+ ITensor *dst{nullptr};
+ ITensorPack run_pack{};
+ std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{nullptr};
+};
+
+NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
{
- return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max);
}
-
NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default;
-void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info)
+void NEGEMMLowpOutputStage::configure(const ITensor *input,
+ const ITensor *bias,
+ ITensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- switch(info.output_data_type)
- {
- case DataType::QASYMM8:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- case DataType::QSYMM16:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported output data type.");
- break;
- }
- }
- break;
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- switch(info.output_data_type)
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>();
- k->configure(input, bias, output, &info);
- _kernel = std::move(k);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported output data type.");
- break;
- }
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
- }
+ ARM_COMPUTE_ERROR_THROW_ON(
+ NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
+ _impl->src = input;
+ _impl->bias = bias;
+ _impl->dst = output;
+ _impl->op = std::make_unique<cpu::CpuGemmLowpOutputStage>();
+ _impl->op->configure(input->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info);
+
+ _impl->run_pack = {
+ {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}};
}
-Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::UNKNOWN, "NEGEMMLowpQuantizeDownScaleByFixedPoint cannot be used with UNKNOWN output data type.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
-
- ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
+ return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info);
+}
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- switch(output->data_type())
- {
- case DataType::QASYMM8:
- return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- case DataType::QASYMM8_SIGNED:
- return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- case DataType::QSYMM16:
- return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
- }
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- switch(output->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
- }
- }
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
- }
+void NEGEMMLowpOutputStage::run()
+{
+ _impl->op->run(_impl->run_pack);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
index 86cbfd187a..62b8cfa48b 100644
--- a/src/runtime/NEON/functions/NEGather.cpp
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGather.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEGatherKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ namespace arm_compute
{
void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
{
+ ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis);
auto k = std::make_unique<NEGatherKernel>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 931fdb22f7..1022b4153e 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -25,10 +25,12 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
namespace arm_compute
{
@@ -67,41 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManage
NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default;
-void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+void NEGenerateProposalsLayer::configure(const ITensor *scores,
+ const ITensor *deltas,
+ const ITensor *anchors,
+ ITensor *proposals,
+ ITensor *scores_out,
+ ITensor *num_valid_proposals,
const GenerateProposalsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+ proposals->info(), scores_out->info(),
+ num_valid_proposals->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
_is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType scores_data_type = scores->info()->data_type();
_is_qasymm8 = scores_data_type == DataType::QASYMM8;
- const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
- const int total_num_anchors = num_anchors * feat_width * feat_height;
- const int pre_nms_topN = info.pre_nms_topN();
- const int post_nms_topN = info.post_nms_topN();
- const size_t values_per_roi = info.values_per_roi();
+ const int num_anchors = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
- const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+ const QuantizationInfo rois_qinfo =
+ (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
// Compute all the anchors
_memory_group.manage(&_all_anchors);
_compute_anchors = std::make_unique<NEComputeAllAnchorsKernel>();
- _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+ _compute_anchors->configure(anchors, &_all_anchors,
+ ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
- _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+ _deltas_flattened.allocator()->init(
+ TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
// Permute and reshape deltas
_memory_group.manage(&_deltas_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
_flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
@@ -115,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
// Permute and reshape scores
_memory_group.manage(&_scores_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1});
_flatten_scores.configure(&_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
@@ -129,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
Tensor *anchors_to_use = &_all_anchors;
Tensor *deltas_to_use = &_deltas_flattened;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
_deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -152,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
anchors_to_use->allocator()->allocate();
_all_proposals_to_use = &_all_proposals;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_memory_group.manage(&_all_proposals_quantized);
// Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
- _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _all_proposals_quantized.allocator()->init(
+ TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
_quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
_all_proposals.allocator()->allocate();
_all_proposals_to_use = &_all_proposals_quantized;
@@ -172,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
// Note that NMS needs outputs preinitialized.
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
- auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+ rois_qinfo);
auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
// Initialize temporaries (unused) outputs
@@ -185,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
_memory_group.manage(&_proposals_4_roi_values);
- const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height());
- _cpp_nms.configure(&_scores_flattened /*scores_in*/,
- _all_proposals_to_use /*boxes_in,*/,
- nullptr /* batch_splits_in*/,
- scores_out /* scores_out*/,
- &_proposals_4_roi_values /*boxes_out*/,
- &_classes_nms_unused /*classes*/,
- nullptr /*batch_splits_out*/,
- &_keeps_nms_unused /*keeps*/,
- num_valid_proposals /* keeps_size*/,
- box_nms_info);
+ const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+ true, min_size_scaled, info.im_width(), info.im_height());
+ _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/,
+ nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/,
+ &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/,
+ num_valid_proposals /* keeps_size*/, box_nms_info);
_keeps_nms_unused.allocator()->allocate();
_classes_nms_unused.allocator()->allocate();
@@ -203,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
_proposals_4_roi_values.allocator()->allocate();
}
-Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
- const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -216,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
- const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+ const int num_anchors =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -227,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
}
- TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
-
- TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- if(scores->data_layout() == DataLayout::NHWC)
+ TensorInfo all_anchors_info(
+ anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(
+ anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info =
+ deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+ .set_is_resizable(true);
+ TensorInfo scores_permuted_info =
+ scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if (scores->data_layout() == DataLayout::NHWC)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
}
- TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo deltas_flattened_info(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
- TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo scores_flattened_info(
+ scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
- TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
- if(is_qasymm8)
+ TensorInfo proposals_4_roi_values_quantized(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+ .set_quantization_info(QuantizationInfo(0.125f, 0));
+ if (is_qasymm8)
{
- TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ TensorInfo all_anchors_f32_info(anchors->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
- TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
-
- TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ TensorInfo deltas_flattened_f32_info(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(
+ &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
- if(num_valid_proposals->total_size() > 0)
+ if (num_valid_proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
}
- if(proposals->total_size() > 0)
+ if (proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -309,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
}
}
- if(scores_out->total_size() > 0)
+ if (scores_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -328,7 +373,7 @@ void NEGenerateProposalsLayer::run()
NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY);
// Transpose and reshape the inputs
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_permute_deltas.run();
_permute_scores.run();
@@ -337,7 +382,7 @@ void NEGenerateProposalsLayer::run()
_flatten_deltas.run();
_flatten_scores.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_dequantize_anchors.run();
_dequantize_deltas.run();
@@ -346,7 +391,7 @@ void NEGenerateProposalsLayer::run()
// Build the boxes
_bounding_box.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_quantize_all_proposals.run();
}
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index 5965b9722f..78218cbdee 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
namespace arm_compute
@@ -33,21 +35,29 @@ namespace arm_compute
NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default;
NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+ : _memory_group(std::move(memory_manager)),
+ _normalization_kernel(),
+ _is_nchw(false),
+ _permute_input(),
+ _permute_output(),
+ _permuted_input(),
+ _permuted_output()
{
}
void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon);
+
const DataLayout data_layout = input->info()->data_layout();
- const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true };
+ const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true};
// Configure Kernels
_is_nchw = data_layout == DataLayout::NCHW;
_normalization_kernel = std::make_unique<NEInstanceNormalizationLayerKernel>();
- if(!_is_nchw)
+ if (!_is_nchw)
{
_memory_group.manage(&_permuted_input);
_memory_group.manage(&_permuted_output);
@@ -69,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl
}
}
-Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+Status NEInstanceNormalizationLayer::validate(
+ const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
{
- return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW),
- &output->clone()->set_data_layout(DataLayout::NCHW),
- InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true });
+ return NEInstanceNormalizationLayerKernel::validate(
+ &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW),
+ InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true});
}
void NEInstanceNormalizationLayer::run()
@@ -81,7 +92,7 @@ void NEInstanceNormalizationLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Permute input
- if(!_is_nchw)
+ if (!_is_nchw)
{
_permute_input.run();
}
@@ -89,7 +100,7 @@ void NEInstanceNormalizationLayer::run()
NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ);
// Permute output
- if(!_is_nchw)
+ if (!_is_nchw)
{
_permute_output.run();
}
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 505ee0a962..b7f6203efd 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
@@ -43,6 +45,8 @@ NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_ma
void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, int axis, float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon);
+
// Manage intermediate buffers
_memory_group.manage(&_sumsq);
@@ -66,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo
sum_sq.set_tensor_shape(shape);
const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
// Reduce shape on axis
shape.set(actual_axis, 1);
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index d338e4fd2d..1a08cdeb06 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,13 @@
#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/common/LSTMParams.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+#include "src/common/utils/Log.h"
namespace arm_compute
{
@@ -47,35 +40,122 @@ using namespace arm_compute::utils::info_helpers;
NELSTMLayer::~NELSTMLayer() = default;
NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
- _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
- _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(),
- _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(),
- _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
- _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(),
- _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(),
- _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(),
- _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(),
- _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(),
- _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false),
+ : _memory_group(std::move(memory_manager)),
+ _fully_connected_input_gate(),
+ _accum_input_gate1(),
+ _subtract_input_gate(),
+ _pixelwise_mul_input_gate(),
+ _activation_input_gate(),
+ _fully_connected_forget_gate(),
+ _accum_forget_gate1(),
+ _pixelwise_mul_forget_gate(),
+ _activation_forget_gate(),
+ _fully_connected_cell_state(),
+ _gemm_cell_state1(),
+ _transpose_cell_state(),
+ _accum_cell_state1(),
+ _accum_cell_state2(),
+ _pixelwise_mul_cell_state1(),
+ _activation_cell_state(),
+ _cell_clip(),
+ _pixelwise_mul_cell_state2(),
+ _fully_connected_output(),
+ _pixelwise_mul_output_state1(),
+ _accum_output1(),
+ _activation_output(),
+ _activation_output_state(),
+ _pixelwise_mul_output_state2(),
+ _fully_connected_output_state(),
+ _projection_clip(),
+ _copy_cell_state(),
+ _copy_output(),
+ _concat_scratch_buffer(),
+ _concat_inputs_forget_gate(),
+ _concat_weights_forget_gate(),
+ _concat_weights_input_gate(),
+ _concat_weights_output(),
+ _mean_std_norm_input_gate(),
+ _pixelwise_mul_input_gate_coeff(),
+ _accum_input_gate_bias(),
+ _mean_std_norm_forget_gate(),
+ _pixelwise_mul_forget_gate_coeff(),
+ _accum_forget_gate_bias(),
+ _mean_std_norm_cell_gate(),
+ _pixelwise_mul_cell_gate_coeff(),
+ _accum_cell_gate_bias(),
+ _mean_std_norm_output_gate(),
+ _pixelwise_mul_output_gate_coeff(),
+ _accum_output_gate_bias(),
+ _input_gate_out1(),
+ _input_gate_out2(),
+ _input_gate_out3(),
+ _input_gate_out4(),
+ _forget_gate_out1(),
+ _forget_gate_out2(),
+ _forget_gate_out3(),
+ _forget_gate_out4(),
+ _forget_gate_out5(),
+ _forget_gate_out6(),
+ _cell_state_out1(),
+ _cell_state_out2(),
+ _cell_state_out3(),
+ _cell_state_out4(),
+ _cell_state_out5(),
+ _output1(),
+ _output2(),
+ _output3(),
+ _output4(),
+ _cell_state_activation(),
+ _output_state1(),
+ _ones(),
+ _input_layer_norm_out1(),
+ _input_layer_norm_out2(),
+ _forget_layer_norm_out1(),
+ _forget_layer_norm_out2(),
+ _cell_layer_norm_out1(),
+ _cell_layer_norm_out2(),
+ _output_layer_norm_out1(),
+ _output_layer_norm_out2(),
+ _run_peephole_opt(false),
+ _run_cifg_opt(false),
+ _perform_cell_clipping(false),
+ _has_projection_weights(false),
+ _perform_projection_clipping(false),
+ _is_prepared(false),
_is_layer_norm_lstm(false)
{
}
-void NELSTMLayer::configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *output_state_in, const ITensor *cell_state_in,
- ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
- const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void NELSTMLayer::configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *output_state_in,
+ const ITensor *cell_state_in,
+ ITensor *scratch_buffer,
+ ITensor *output_state_out,
+ ITensor *cell_state_out,
+ ITensor *output,
+ const LSTMParams<ITensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ cell_threshold, projection_threshold);
_is_layer_norm_lstm = lstm_params.use_layer_norm();
@@ -84,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input,
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(),
- input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- output_state_in->info(), cell_state_in->info(),
- scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
- lstm_params_info, activation_info, cell_threshold, projection_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+ cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+ lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
@@ -117,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input,
_concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6,
+ (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
Tensor *forget_gate_out = &_forget_gate_out5;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
- _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+ ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -139,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input,
{
_forget_gate_out3.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_layer_norm_out1);
_memory_group.manage(&_forget_layer_norm_out2);
_mean_std_norm_forget_gate.configure(forget_gate_out);
- _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(),
+ &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
forget_gate_out->allocator()->allocate();
- _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_forget_layer_norm_out1.allocator()->allocate();
forget_gate_out = &_forget_layer_norm_out2;
}
- _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(forget_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -162,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input,
// input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
Tensor *input_gate_out = &_input_gate_out1;
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -184,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out4);
- _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2,
+ (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+ &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out4);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -202,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input,
_input_gate_out1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_input_layer_norm_out1);
_memory_group.manage(&_input_layer_norm_out2);
_mean_std_norm_input_gate.configure(input_gate_out);
- _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(),
+ &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
input_gate_out->allocator()->allocate();
- _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(),
+ &_input_layer_norm_out2, ConvertPolicy::SATURATE);
_input_layer_norm_out1.allocator()->allocate();
input_gate_out = &_input_layer_norm_out2;
}
- _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_input_gate.configure(input_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -229,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input,
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias,
+ &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
_transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
_memory_group.manage(&_cell_state_out3);
@@ -238,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_state_out4);
_accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
Tensor *cell_state_out_ptr = &_cell_state_out4;
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_layer_norm_out1);
_memory_group.manage(&_cell_layer_norm_out2);
_mean_std_norm_cell_gate.configure(cell_state_out_ptr);
- _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(),
+ &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
cell_state_out_ptr->allocator()->allocate();
- _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_cell_layer_norm_out1.allocator()->allocate();
cell_state_out_ptr = &_cell_layer_norm_out2;
}
_activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
cell_state_out_ptr->allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
- if(cell_threshold != 0.f)
+ if (cell_threshold != 0.f)
{
_perform_cell_clipping = true;
- _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+ _cell_clip.configure(&_cell_state_out1, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold));
}
// Configure block that calculates the output
@@ -282,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias,
+ &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
Tensor *output_gate_out = &_output4;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
_memory_group.manage(&_output3);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -305,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input,
{
_output1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_output_layer_norm_out1);
_memory_group.manage(&_output_layer_norm_out2);
_mean_std_norm_output_gate.configure(output_gate_out);
- _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(),
+ &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
output_gate_out->allocator()->allocate();
- _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_output_layer_norm_out1.allocator()->allocate();
output_gate_out = &_output_layer_norm_out2;
}
- _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_output.configure(output_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -336,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_state_activation);
_activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
- _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_state_activation.allocator()->allocate();
output_gate_out->allocator()->allocate();
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
_has_projection_weights = true;
- _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out);
_output_state1.allocator()->allocate();
// Perform clipping
- if(projection_threshold != 0.f)
+ if (projection_threshold != 0.f)
{
_perform_projection_clipping = true;
- _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ _projection_clip.configure(output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -projection_threshold, projection_threshold));
}
}
@@ -359,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input,
// Vector for holding the tensors to store in scratch buffer
std::vector<const ITensor *> scratch_inputs;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
scratch_inputs.emplace_back(input_gate_out);
}
@@ -373,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input,
output_gate_out->allocator()->allocate();
}
-Status NELSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status NELSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check dimensions
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -414,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
- && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+ cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
const unsigned int num_batches = input->dimension(1);
const unsigned int num_cells = input_to_output_weights->dimension(1);
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
// If CIFG is used, input layer normalization weights tensor is omitted
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
}
@@ -435,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
}
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -446,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
}
// Check peephole optimization
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -466,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
// Validate forget gate
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate input gate
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
- lstm_params.recurrent_to_input_weights(),
- lstm_params.input_gate_bias());
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -500,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
- TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, lstm_params.input_to_input_weights(),
+ (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+ &input_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(lstm_params.use_layer_norm())
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(cell_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (cell_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
- cell_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&cell_state_tmp, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold)));
}
// Validate output gate tmp
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
- TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(),
+ &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
- if(projection_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out));
+ if (projection_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out,
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ output_state_out, output_state_out,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
}
}
@@ -591,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
// Validate scratch concatenation
std::vector<const ITensorInfo *> inputs_vector_info_raw;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
inputs_vector_info_raw.push_back(&input_gate);
}
@@ -612,12 +775,12 @@ void NELSTMLayer::run()
_concat_inputs_forget_gate.run();
_fully_connected_forget_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_forget_gate.run();
_accum_forget_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_forget_gate.run();
_pixelwise_mul_forget_gate_coeff.run();
@@ -625,15 +788,17 @@ void NELSTMLayer::run()
}
_activation_forget_gate.run();
- if(_run_cifg_opt)
+ if (_run_cifg_opt)
{
- if(_ones.info()->data_type() == DataType::F16)
+ if (_ones.info()->data_type() == DataType::F16)
{
- std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ std::fill_n(reinterpret_cast<half *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 1);
}
else
{
- std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ std::fill_n(reinterpret_cast<float *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 1);
}
_subtract_input_gate.run();
}
@@ -641,13 +806,13 @@ void NELSTMLayer::run()
{
_fully_connected_input_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_input_gate.run();
_accum_input_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_input_gate.run();
_pixelwise_mul_input_gate_coeff.run();
@@ -660,29 +825,30 @@ void NELSTMLayer::run()
_transpose_cell_state.run();
_gemm_cell_state1.run();
_accum_cell_state1.run();
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_cell_gate.run();
_pixelwise_mul_cell_gate_coeff.run();
_accum_cell_gate_bias.run();
}
+
_activation_cell_state.run();
_pixelwise_mul_cell_state1.run();
_pixelwise_mul_cell_state2.run();
_accum_cell_state2.run();
- if(_perform_cell_clipping)
+ if (_perform_cell_clipping)
{
_cell_clip.run();
}
_fully_connected_output.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_output_state1.run();
_accum_output1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_output_gate.run();
_pixelwise_mul_output_gate_coeff.run();
@@ -693,10 +859,10 @@ void NELSTMLayer::run()
_activation_output_state.run();
_pixelwise_mul_output_state2.run();
- if(_has_projection_weights)
+ if (_has_projection_weights)
{
_fully_connected_output_state.run();
- if(_perform_projection_clipping)
+ if (_perform_projection_clipping)
{
_projection_clip.run();
}
@@ -710,10 +876,10 @@ void NELSTMLayer::run()
void NELSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_concat_weights_forget_gate.run();
- if(!_run_cifg_opt)
+ if (!_run_cifg_opt)
{
_concat_weights_input_gate.run();
}
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index 5c0f19a15c..41f9c3d700 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -24,17 +24,10 @@
#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include <cmath>
@@ -54,32 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit
NELSTMLayerQuantized::~NELSTMLayerQuantized() = default;
NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
- _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(),
- _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr),
- _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr),
- _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(),
- _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(),
- _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(),
+ : _memory_group(std::move(memory_manager)),
+ _gemmlowp(),
+ _output_stage(),
+ _transpose_weights(),
+ _concat_input_weights(),
+ _concat_recurrent_weights(),
+ _concat_weights(),
+ _concat_inputs(),
+ _concat_bias(),
+ _sigmoid_forget_gate(),
+ _sigmoid_input_gate(),
+ _sigmoid_output_gate(),
+ _tanh_modulation_gate(),
+ _tanh_output_state(),
+ _add1(),
+ _add2(),
+ _mul1(),
+ _mul2(),
+ _mul3(),
+ _slice_input_tensor(),
+ _slice_forget_tensor(),
+ _slice_cell_tensor(),
+ _slice_output_tensor(),
+ _dequantize(),
+ _quantize(),
+ _input_to_input_weights(nullptr),
+ _input_to_forget_weights(nullptr),
+ _input_to_cell_weights(nullptr),
+ _input_to_output_weights(nullptr),
+ _recurrent_to_input_weights(nullptr),
+ _recurrent_to_forget_weights(nullptr),
+ _recurrent_to_cell_weights(nullptr),
+ _recurrent_to_output_weights(nullptr),
+ _input_gate_bias(nullptr),
+ _forget_gate_bias(nullptr),
+ _cell_bias(nullptr),
+ _output_gate_bias(nullptr),
+ _recurrent_weights(),
+ _input_weights(),
+ _weights(),
+ _input(),
+ _weights_transposed(),
+ _output_highp(),
+ _output_lowp(),
+ _bias(),
+ _forget_gate_input(),
+ _input_gate_input(),
+ _output_gate_input(),
+ _input_modulation_gate_input(),
+ _forget_gate_output(),
+ _input_gate_output(),
+ _output_gate_output(),
+ _input_modulation_gate_output(),
+ _cell_state1(),
+ _cell_state2(),
+ _output_state_tmp(),
+ _output_state_out_symm(),
+ _output_state_out_f32(),
_is_prepared(false)
{
}
void NELSTMLayerQuantized::configure(const ITensor *input,
- const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out)
+ const ITensor *input_to_input_weights,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_input_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *input_gate_bias,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ ITensor *cell_state_in,
+ const ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
-
- ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
- input_to_output_weights->info(),
- recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(
+ input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+ recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+ cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
const int input_size = input->info()->dimension(0);
const int batch_size = input->info()->dimension(1);
@@ -87,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
- auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
- auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+ auto_init_if_empty(*cell_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+ auto_init_if_empty(*output_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
_input_to_input_weights = input_to_input_weights;
_input_to_forget_weights = input_to_forget_weights;
@@ -104,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
_output_gate_bias = output_gate_bias;
// Weights concatenation
- std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights };
- std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights };
+ std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights};
+ std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights};
- _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _input_weights.allocator()->init(
+ TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
- _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _recurrent_weights.allocator()->init(
+ TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
- std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights };
- _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights};
+ _weights.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_weights.configure(weights_vector, &_weights, Window::DimX);
_transpose_weights.configure(&_weights, &_weights_transposed);
// Input concatenation
- std::vector<const ITensor *> input_vector{ input, output_state_in };
+ std::vector<const ITensor *> input_vector{input, output_state_in};
_memory_group.manage(&_input);
- _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+ _input.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
_concat_inputs.configure(input_vector, &_input, Window::DimX);
// Bias concatenation
- std::vector<const ITensor *> bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias };
+ std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias};
_bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
_concat_bias.configure(bias_vector, &_bias, Window::DimX);
// Invert the offset for gemmlowp
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
// Run gemmlowp
_memory_group.manage(&_output_highp);
@@ -141,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
// Set the offset back
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
// multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
_output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
@@ -152,69 +227,91 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_memory_group.manage(&_output_lowp);
- _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+
+ GEMMLowpOutputStageInfo info;
+ info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ info.gemmlowp_multiplier = output_multiplier;
+ info.gemmlowp_shift = output_shift;
+ info.output_data_type = DataType::QSYMM16;
+ _output_stage.configure(&_output_highp, &_bias, &_output_lowp, info);
_output_highp.allocator()->allocate();
_bias.allocator()->allocate();
// Get the gate tensors
- if(batch_size > 1)
+ if (batch_size > 1)
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0},
+ {2 * output_size, batch_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0},
+ {4 * output_size, batch_size});
_output_lowp.allocator()->allocate();
}
else
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+ {3 * output_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size});
_output_lowp.allocator()->allocate();
}
// Forget gate
_memory_group.manage(&_forget_gate_output);
- _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_output.allocator()->init(
+ TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_forget_gate_input.allocator()->allocate();
// Input gate
_memory_group.manage(&_input_gate_output);
- _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_output.allocator()->init(
+ TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_input_gate_input.allocator()->allocate();
// Input modulation gate equation
_memory_group.manage(&_input_modulation_gate_output);
- _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _input_modulation_gate_output.allocator()->init(
+ TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_input_modulation_gate_input.allocator()->allocate();
// Output gate
_memory_group.manage(&_output_gate_output);
- _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_output.allocator()->init(
+ TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_output_gate_input.allocator()->allocate();
// Long term memory
_memory_group.manage(&_cell_state1);
- _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state1.allocator()->init(
+ TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_forget_gate_output.allocator()->allocate();
_memory_group.manage(&_cell_state2);
- _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state2.allocator()->init(
+ TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_input_modulation_gate_output.allocator()->allocate();
_input_gate_output.allocator()->allocate();
@@ -224,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
// Short term memory
_memory_group.manage(&_output_state_tmp);
- _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _output_state_tmp.allocator()->init(
+ TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_output_state.configure(cell_state_out, &_output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_memory_group.manage(&_output_state_out_symm);
- _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_state_out_symm.allocator()->init(
+ TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate_output.allocator()->allocate();
_output_state_tmp.allocator()->allocate();
// Requantize the output state from QSYMM16 to QASYMM8
_memory_group.manage(&_output_state_out_f32);
- _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+ _output_state_out_f32.allocator()->init(
+ TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
_dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
_output_state_out_symm.allocator()->allocate();
@@ -244,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
}
Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
- output_state_in, cell_state_out, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
const int input_size = input->dimension(0);
const int batch_size = input->dimension(1);
@@ -264,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
- TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
- TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
- TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+ TensorInfo input_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(input_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(output_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo bias_info(
+ input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+ TensorInfo output_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QASYMM8)
+ .set_quantization_info(qasymm));
+ TensorInfo cell_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QSYMM16)
+ .set_quantization_info(qsymm_4));
// Shape checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
// Data type checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
// Quantization checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
@@ -308,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
- ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
// _concat_weights
std::vector<const ITensorInfo *> weights_vector;
@@ -318,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
// _transpose_weights
const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
- TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+ TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed));
// _concat_inputs
@@ -344,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
// _gemmlowp
const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
// Set the offset back
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -355,78 +494,107 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int32_t output_multiplier = 0;
int32_t output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
// _output_stage
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
+ GEMMLowpOutputStageInfo info;
+ info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ info.gemmlowp_multiplier = output_multiplier;
+ info.gemmlowp_shift = output_shift;
+ info.output_data_type = DataType::QSYMM16;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info));
TensorInfo input_gate_input;
TensorInfo forget_gate_input;
TensorInfo input_modulation_gate_input;
TensorInfo output_gate_input;
- if(batch_size > 1)
+ if (batch_size > 1)
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
}
else
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
}
// _sigmoid_forget_gate
const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _sigmoid_input_gate
const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _tanh_modulation_gate
- const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+ qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _sigmoid_output_gate
const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&output_gate_input, &output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _mul_forget_gate_cell_state
const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
// _mul_input_gate_input_mod_gate
const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+ &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _add_cell_state_tmps
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
// _tanh_modulation_gate
const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, &output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _mul_output_state_tmp_output_gate
const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+ &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _dequantize
const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -435,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
// _quantize
ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out));
- if(cell_state_out->total_size() != 0)
+ if (cell_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
}
- if(output_state_out->total_size() != 0)
+ if (output_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -501,7 +669,7 @@ void NELSTMLayerQuantized::run()
void NELSTMLayerQuantized::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_input_weights.allocator()->allocate();
_concat_input_weights.run();
diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp
index 171d84da19..0013a521d1 100644
--- a/src/runtime/NEON/functions/NELogical.cpp
+++ b/src/runtime/NEON/functions/NELogical.cpp
@@ -25,21 +25,22 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NELogicalKernel.h"
namespace arm_compute
{
struct LogicalArgs
{
- std::unique_ptr<kernels::NELogicalKernel> kernel{ nullptr };
+ std::unique_ptr<kernels::NELogicalKernel> kernel{nullptr};
ITensorPack pack{};
};
struct NELogicalAnd::Impl : public LogicalArgs
{
};
-NELogicalAnd::NELogicalAnd()
- : _impl(std::make_unique<Impl>())
+NELogicalAnd::NELogicalAnd() : _impl(std::make_unique<Impl>())
{
}
NELogicalAnd::~NELogicalAnd() = default;
@@ -47,6 +48,7 @@ NELogicalAnd::~NELogicalAnd() = default;
void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
_impl->kernel = std::make_unique<kernels::NELogicalKernel>();
_impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::And);
@@ -70,8 +72,7 @@ void NELogicalAnd::run()
struct NELogicalOr::Impl : public LogicalArgs
{
};
-NELogicalOr::NELogicalOr()
- : _impl(std::make_unique<Impl>())
+NELogicalOr::NELogicalOr() : _impl(std::make_unique<Impl>())
{
}
NELogicalOr::~NELogicalOr() = default;
@@ -79,6 +80,7 @@ NELogicalOr::~NELogicalOr() = default;
void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
_impl->kernel = std::make_unique<kernels::NELogicalKernel>();
_impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::Or);
@@ -102,8 +104,7 @@ void NELogicalOr::run()
struct NELogicalNot::Impl : public LogicalArgs
{
};
-NELogicalNot::NELogicalNot()
- : _impl(std::make_unique<Impl>())
+NELogicalNot::NELogicalNot() : _impl(std::make_unique<Impl>())
{
}
NELogicalNot::~NELogicalNot() = default;
@@ -111,6 +112,7 @@ NELogicalNot::~NELogicalNot() = default;
void NELogicalNot::configure(const ITensor *input, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output);
_impl->kernel = std::make_unique<kernels::NELogicalKernel>();
_impl->kernel->configure(input->info(), nullptr, output->info(), LogicalOperation::Not);
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
new file mode 100644
index 0000000000..31898bafc4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuMatMul.h"
+
+namespace arm_compute
+{
+struct NEMatMul::Impl
+{
+ const ITensor *lhs{nullptr};
+ const ITensor *rhs{nullptr};
+ ITensor *output{nullptr};
+ std::unique_ptr<cpu::CpuMatMul> op{nullptr};
+ MemoryGroup memory_group{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ ITensorPack run_pack{};
+};
+
+NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>())
+{
+}
+
+NEMatMul::~NEMatMul() = default;
+
+void NEMatMul::configure(ITensor *lhs,
+ ITensor *rhs,
+ ITensor *output,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ _impl->lhs = lhs;
+ _impl->rhs = rhs;
+ _impl->output = output;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output);
+ _impl->op = std::make_unique<cpu::CpuMatMul>();
+ _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info);
+ _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
+ _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
+}
+
+Status NEMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info);
+}
+
+void NEMatMul::run()
+{
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
index 656777d726..c3861afd2c 100644
--- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,36 +25,66 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
-#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
+#include "src/cpu/operators/CpuMaxUnpooling.h"
namespace arm_compute
{
+struct NEMaxUnpoolingLayer::Impl
+{
+ const ITensor *src{nullptr};
+ const ITensor *indices{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuMaxUnpooling> op{nullptr};
+};
+
NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default;
-NEMaxUnpoolingLayer::NEMaxUnpoolingLayer()
- : _fill_func(), _unpooling_layer_kernel()
+NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl()
{
}
-void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
+void NEMaxUnpoolingLayer::configure(ITensor *input,
+ ITensor *indices,
+ ITensor *output,
+ const PoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
+
const PixelValue zero_value(0.f);
- _fill_func = std::make_unique<NEFill>();
- _unpooling_layer_kernel = std::make_unique<NEMaxUnpoolingLayerKernel>();
+ _fill_func = std::make_unique<NEFill>();
+ _impl = std::make_unique<Impl>();
+ _impl->src = input;
+ _impl->indices = indices;
+ _impl->dst = output;
+
+ _impl->op = std::make_unique<cpu::CpuMaxUnpooling>();
_fill_func->configure(output, zero_value);
- _unpooling_layer_kernel->configure(input, indices, output, pool_info);
+ _impl->op->configure(input->info(), indices->info(), output->info(), pool_info);
}
-Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
- return NEMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info));
+ return Status{};
}
void NEMaxUnpoolingLayer::run()
{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->indices);
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
_fill_func->run();
- NEScheduler::get().schedule(_unpooling_layer_kernel.get(), Window::DimY);
+ _impl->op->run(pack);
}
} /* namespace arm_compute */
diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
index 02de983b77..dec0dde56d 100644
--- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
namespace arm_compute
@@ -31,6 +32,8 @@ NEMeanStdDevNormalizationLayer::~NEMeanStdDevNormalizationLayer() = default;
void NEMeanStdDevNormalizationLayer::configure(ITensor *input, ITensor *output, float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, epsilon);
+
auto k = std::make_unique<NEMeanStdDevNormalizationKernel>();
k->configure(input, output, epsilon);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 9dcb157c03..d6d2e9dc46 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
namespace arm_compute
@@ -43,6 +45,7 @@ NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memor
void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, norm_info);
TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type());
_input_squared.allocator()->init(tensor_info);
@@ -59,13 +62,16 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons
_input_squared.allocator()->allocate();
}
-Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status NENormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
return Status{};
}
@@ -76,4 +82,4 @@ void NENormalizationLayer::run()
_multiply_f.run();
NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY);
}
-} \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index a05b545e9a..963e68bac7 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuPRelu.h"
+
+#include "src/cpu/operators/CpuPRelu.h"
namespace arm_compute
{
@@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu;
struct NEPReluLayer::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
};
-NEPReluLayer::NEPReluLayer()
- : _impl(std::make_unique<Impl>())
+NEPReluLayer::NEPReluLayer() : _impl(std::make_unique<Impl>())
{
}
-NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
+NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default;
NEPReluLayer::~NEPReluLayer() = default;
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 531b06de64..253566df0f 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -23,12 +23,13 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/kernels/NEPadLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
namespace arm_compute
{
@@ -37,9 +38,9 @@ namespace
uint32_t last_padding_dimension(const PaddingList &padding)
{
int last_padding_dim = padding.size() - 1;
- for(; last_padding_dim >= 0; --last_padding_dim)
+ for (; last_padding_dim >= 0; --last_padding_dim)
{
- if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+ if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
{
break;
}
@@ -51,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding)
NEPadLayer::~NEPadLayer() = default;
NEPadLayer::NEPadLayer()
- : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+ : _copy_function(),
+ _pad_kernel(),
+ _mode(),
+ _padding(),
+ _num_dimensions(0),
+ _slice_functions(),
+ _concat_functions(),
+ _slice_results(),
+ _concat_results()
{
}
-void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value)
{
_pad_kernel = std::make_unique<NEPadLayerKernel>();
_pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
@@ -84,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
Coordinates ends_after{};
Coordinates strides{};
ITensor *prev = input;
- for(uint32_t i = 0; i < _num_dimensions; ++i)
+ for (uint32_t i = 0; i < _num_dimensions; ++i)
{
// Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
- if(i > 0)
+ if (i > 0)
{
strides.set(i - 1, 1);
}
- if(_padding[i].first > 0 || _padding[i].second > 0)
+ if (_padding[i].first > 0 || _padding[i].second > 0)
{
// Set the starts, ends, and strides values for the current dimension.
// Due to the bit masks passed to strided slice, the values below the current dimension in
// starts and ends will be ignored so do not need to be modified.
- if(_mode == PaddingMode::REFLECT)
+ if (_mode == PaddingMode::REFLECT)
{
starts_before.set(i, _padding[i].first);
ends_before.set(i, 0);
@@ -123,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
// Reflect the input values for the padding before and after the input.
std::vector<const ITensor *> concat_vector;
- if(_padding[i].first > 0)
+ if (_padding[i].first > 0)
{
- if(i < prev->info()->num_dimensions())
+ if (i < prev->info()->num_dimensions())
{
- _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+ _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides,
+ begin_mask_before, end_mask_before);
concat_vector.emplace_back(&_slice_results[2 * i]);
}
else
@@ -137,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
}
concat_vector.push_back(prev);
- if(_padding[i].second > 0)
+ if (_padding[i].second > 0)
{
- if(i < prev->info()->num_dimensions())
+ if (i < prev->info()->num_dimensions())
{
- _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+ _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after,
+ strides, begin_mask_after, end_mask_after);
concat_vector.emplace_back(&_slice_results[2 * i + 1]);
}
else
@@ -152,8 +166,13 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
// Concatenate the padding before and after with the input.
ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
+ out->info()->set_quantization_info(output->info()->quantization_info());
+ for (auto &v : concat_vector)
+ {
+ v->info()->set_quantization_info(input->info()->quantization_info());
+ }
_concat_functions[i].configure(concat_vector, out, i);
- if(i != _num_dimensions - 1)
+ if (i != _num_dimensions - 1)
{
_concat_results[i].allocator()->allocate();
}
@@ -164,22 +183,28 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
}
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayer::configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+ ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
_padding = padding;
_mode = mode;
- const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+ const TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
// Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
_num_dimensions = last_padding_dimension(padding) + 1;
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
- switch(_mode)
+ switch (_mode)
{
case PaddingMode::CONSTANT:
{
@@ -203,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p
}
}
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
@@ -224,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
case PaddingMode::REFLECT:
case PaddingMode::SYMMETRIC:
{
- for(uint32_t i = 0; i < padding.size(); ++i)
+ for (uint32_t i = 0; i < padding.size(); ++i)
{
- if(mode == PaddingMode::REFLECT)
+ if (mode == PaddingMode::REFLECT)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
@@ -249,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
void NEPadLayer::run()
{
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
- switch(_mode)
+ switch (_mode)
{
case PaddingMode::CONSTANT:
{
@@ -261,15 +290,15 @@ void NEPadLayer::run()
case PaddingMode::REFLECT:
case PaddingMode::SYMMETRIC:
{
- for(uint32_t i = 0; i < _num_dimensions; ++i)
+ for (uint32_t i = 0; i < _num_dimensions; ++i)
{
- if(_padding[i].first > 0 || _padding[i].second > 0)
+ if (_padding[i].first > 0 || _padding[i].second > 0)
{
- if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+ if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
{
_slice_functions[2 * i].run();
}
- if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+ if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
{
_slice_functions[2 * i + 1].run();
}
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index f707fad757..80cd04ce6c 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -24,19 +24,19 @@
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
+
+#include "src/cpu/operators/CpuPermute.h"
namespace arm_compute
{
struct NEPermute::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuPermute> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuPermute> op{nullptr};
};
-NEPermute::NEPermute()
- : _impl(std::make_unique<Impl>())
+NEPermute::NEPermute() : _impl(std::make_unique<Impl>())
{
}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 3a2f1984b4..97155a9e74 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuMul.h"
+
+#include "src/cpu/operators/CpuMul.h"
#include <utility>
@@ -32,32 +33,42 @@ namespace arm_compute
{
struct NEPixelWiseMultiplication::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuMul> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuMul> op{nullptr};
};
-NEPixelWiseMultiplication::NEPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default;
-Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void NEPixelWiseMultiplication::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuMul>();
- _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+ _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy,
+ act_info);
}
void NEPixelWiseMultiplication::run()
@@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run()
struct NEComplexPixelWiseMultiplication::Impl
{
- ITensor *src_0{ nullptr };
- ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuComplexMul> op{ nullptr };
+ ITensor *src_0{nullptr};
+ ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuComplexMul> op{nullptr};
};
-NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default;
-Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuComplexMul::validate(input1, input2, output, act_info);
}
-void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
new file mode 100644
index 0000000000..e017e8c21d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuPool3d.h"
+
+namespace arm_compute
+{
+struct NEPooling3dLayer::Impl
+{
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuPool3d> op{nullptr};
+ MemoryGroup memory_group{};
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace_tensors{};
+};
+
+NEPooling3dLayer::~NEPooling3dLayer() = default;
+
+NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
+{
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+}
+
+void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info)
+{
+ _impl->src = input;
+ _impl->dst = output;
+ _impl->op = std::make_unique<cpu::CpuPool3d>();
+ _impl->op->configure(input->info(), output->info(), pool_info);
+
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}};
+ _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
+}
+
+Status
+NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+{
+ return cpu::CpuPool3d::validate(input, output, pool_info);
+}
+
+void NEPooling3dLayer::run()
+{
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
+ _impl->op->run(_impl->run_pack);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 8d267a32c0..eb9125be3c 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -26,17 +26,18 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuPool2d.h"
+#include "src/cpu/operators/CpuPool2d.h"
namespace arm_compute
{
struct NEPoolingLayer::Impl
{
- ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- ITensor *indices{ nullptr };
- std::unique_ptr<cpu::CpuPool2d> op{ nullptr };
+ ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ ITensor *indices{nullptr};
+ std::unique_ptr<cpu::CpuPool2d> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<Tensor> workspace_tensors{};
@@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl
NEPoolingLayer::~NEPoolingLayer() = default;
-NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
}
@@ -58,11 +58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
_impl->op = std::make_unique<cpu::CpuPool2d>();
_impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src},
+ {TensorType::ACL_DST_0, _impl->dst},
+ {TensorType::ACL_DST_1, _impl->indices}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status NEPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
return cpu::CpuPool2d::validate(input, output, pool_info, indices);
}
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index 0c71706586..dbb6bf9df1 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,21 +27,31 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
namespace arm_compute
{
-void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayer::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ const PriorBoxLayerInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
+
auto k = std::make_unique<NEPriorBoxLayerKernel>();
k->configure(input1, input2, output, info);
_kernel = std::move(k);
}
-Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayer::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
return NEPriorBoxLayerKernel::validate(input1, input2, output, info);
}
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 85d62ac058..dd78d10d16 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,34 +23,38 @@
*/
#include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
namespace arm_compute
{
using namespace arm_compute::utils::info_helpers;
namespace
{
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
- float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensorInfo *mm_input,
+ const ITensorInfo *mm_weights,
+ const ITensorInfo *bias,
+ float gemmlowp_scale,
+ const TensorInfo *mm_res_info,
+ const TensorInfo *outstage_tensor_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
return Status{};
}
} // namespace
@@ -59,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf
{
// Output quantization scale will be different, but ignored here
// since it will be configured at configure() stage.
- const TensorInfo out
- {
- in
- };
+ const TensorInfo out{in};
return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
}
@@ -92,6 +93,8 @@ Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const IT
void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst)
{
ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::TensorCopyKernel::validate(*src.info(), *dst.info()));
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+
_src = &src;
_dst = &dst;
_row_size = std::min(_src->info()->tensor_shape().x(), _dst->info()->tensor_shape().x());
@@ -100,39 +103,108 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst)
void NEQLSTMLayer::TensorCopyKernel::run()
{
- Iterator input_iter{ _src, _window };
- Iterator output_iter{ _dst, _window };
+ Iterator input_iter{_src, _window};
+ Iterator output_iter{_dst, _window};
- execute_window_loop(_window, [&](const Coordinates &)
- {
- memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
- },
- input_iter, output_iter);
+ execute_window_loop(
+ _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+ output_iter);
}
NEQLSTMLayer::~NEQLSTMLayer() = default;
NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(),
- _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(),
- _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(),
- _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(),
- _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(),
- _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(),
- _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(),
- _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(),
- _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(),
- _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(),
- _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(),
+ : _memory_group(),
+ _dequantize_input_to_forget_weights(),
+ _quantize_input_to_forget_weights(),
+ _transpose_input_to_forget_weights(),
+ _transpose_input_to_cell_weights(),
+ _transpose_input_to_output_weights(),
+ _transpose_input_to_input_weights(),
+ _transpose_recurrent_to_forget_weights(),
+ _transpose_recurrent_to_cell_weights(),
+ _transpose_recurrent_to_output_weights(),
+ _transpose_recurrent_to_input_weights(),
+ _transpose_projection_weights(),
+ _input_to_input_reduction(),
+ _recurrent_to_input_reduction(),
+ _input_to_forget_reduction(),
+ _recurrent_to_forget_reduction(),
+ _input_to_cell_reduction(),
+ _recurrent_to_cell_reduction(),
+ _input_to_output_reduction(),
+ _recurrent_to_output_reduction(),
+ _projection_reduction(),
+ _projection_bias_add(),
+ _mm_input_to_forget(),
+ _mm_recurrent_to_forget(),
+ _pixelwise_mul_cell_to_forget(),
+ _input_to_forget_outstage(),
+ _recurrent_to_forget_outstage(),
+ _cell_to_forget_outstage(),
+ _accumulate_input_recurrent_forget(),
+ _accumulate_cell_forget(),
+ _forget_gate_sigmoid(),
+ _mm_input_to_cell(),
+ _input_to_cell_outstage(),
+ _mm_recurrent_to_cell(),
+ _recurrent_to_cell_outstage(),
+ _accumulate_input_recurrent_modulation(),
+ _cell_gate_tanh(),
+ _input_gate_sub(),
+ _mm_input_to_input(),
+ _input_to_input_outstage(),
+ _mm_recurrent_to_input(),
+ _recurrent_to_input_outstage(),
+ _accumulate_input_recurrent_input(),
+ _pixelwise_mul_cell_to_input(),
+ _cell_to_input_outstage(),
+ _accumulate_cell_input(),
+ _input_gate_sigmoid(),
+ _pixelwise_mul_forget_cell(),
+ _pixelwise_mul_input_cell(),
+ _add_forget_cell(),
+ _cell_clip(),
+ _mm_input_to_output(),
+ _input_to_output_outstage(),
+ _mm_recurrent_to_output(),
+ _recurrent_to_output_outstage(),
+ _accumulate_input_recurrent_output(),
+ _pixelwise_mul_cell_to_output(),
+ _cell_to_output_outstage(),
+ _accumulate_cell_to_output(),
+ _output_gate_sigmoid(),
+ _hidden_tanh(),
+ _pixelwise_mul_hidden(),
+ _hidden_outstage(),
+ _mm_projection(),
+ _projection_outstage(),
+ _accumulate_projection(),
+ _projection_clip(),
+ _projection_bias_copy(),
+ _projection_output_to_accumulate_copy(),
+ _projection_accumulate_to_output_copy(),
+ _hidden_to_output_copy(),
+ _layer_norms(),
+ _copy_output(),
+ _layer_norm_weights(),
+ _layer_norm_bias(),
_layer_norm_output()
{
_memory_group = MemoryGroup(std::move(memory_manager));
}
-void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias,
- Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+ NEGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensor *mm_input,
+ const ITensor *mm_weights,
+ const ITensor *bias,
+ Tensor *mm_res,
+ Tensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info)
{
_memory_group.manage(mm_res);
_memory_group.manage(outstage_res);
@@ -144,33 +216,88 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp
mm.configure(mm_input, mm_weights, nullptr, mm_res);
// Configure output stage
- quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
outstage.configure(mm_res, bias, outstage_res, gemmlowp_info);
mm_res->allocator()->allocate();
}
-void NEQLSTMLayer::configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *cell_state_in, ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+void NEQLSTMLayer::configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *cell_state_in,
+ ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out,
+ ITensor *output,
const LSTMParams<ITensor> &lstm_params)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
// Set lstm parameters
LSTMParams<ITensorInfo> lstm_params_info{};
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ _input_to_forget_weights_transposed.info()->set_quantization_info(
+ input_to_forget_weights->info()->quantization_info());
+ _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info());
+ _input_to_output_weights_transposed.info()->set_quantization_info(
+ input_to_output_weights->info()->quantization_info());
+ _recurrent_to_forget_weights_transposed.info()->set_quantization_info(
+ recurrent_to_forget_weights->info()->quantization_info());
+ _recurrent_to_cell_weights_transposed.info()->set_quantization_info(
+ recurrent_to_cell_weights->info()->quantization_info());
+ _recurrent_to_output_weights_transposed.info()->set_quantization_info(
+ recurrent_to_output_weights->info()->quantization_info());
+
+ if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+ {
+ _convert_input_to_forget_weights_to_qsymm8 = true;
+ // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32
+
+ _input_to_forget_weights_f32.allocator()->init(
+ TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32)
+ .set_data_layout(input_to_forget_weights->info()->data_layout()));
+ // Setup the quantize output tensor to go from F32 -> QSYMM8
+ _input_to_forget_weights_symm8.allocator()->init(
+ (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8)
+ .set_data_layout(input_to_forget_weights->info()->data_layout())
+ .set_quantization_info(input_to_forget_weights->info()->quantization_info())));
+
+ _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32);
+ _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8);
+
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+ input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+ recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+ output->info(), lstm_params_info));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+ recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+ output->info(), lstm_params_info));
+ }
const int batch_size = input->info()->dimension(1);
const int num_units = input_to_output_weights->info()->dimension(1);
@@ -181,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input,
const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();
_projection_bias = lstm_params.projection_bias();
- _input_to_forget_weights = input_to_forget_weights;
+ _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+ ? &_input_to_forget_weights_symm8
+ : input_to_forget_weights;
_input_to_cell_weights = input_to_cell_weights;
_input_to_output_weights = input_to_output_weights;
_recurrent_to_forget_weights = recurrent_to_forget_weights;
@@ -191,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Layer normalization
_has_layer_norm = lstm_params.use_layer_norm();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -213,44 +342,59 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
_has_cell_clipping = quantized_cell_clip > 0;
// Precompute effective bias for optimizing the matmul computations.
- if(!_has_cifg)
+ if (!_has_cifg)
{
_input_to_input_weights = lstm_params.input_to_input_weights();
_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
- _input_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction->configure(
+ _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
}
- _input_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-
- _input_to_forget_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- if(_has_projection)
+ _input_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+
+ _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction->configure(
+ recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction->configure(
+ recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction->configure(
+ recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ if (_has_projection)
{
- _projection_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
- if(_projection_bias != nullptr)
+ _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _projection_reduction->configure(
+ _projection_weights->info(), _projection_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ if (_projection_bias != nullptr)
{
- _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+ _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias,
+ ConvertPolicy::SATURATE);
}
}
@@ -258,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input,
_transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed);
_transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed);
_transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed);
- _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
+ _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights,
+ &_recurrent_to_forget_weights_transposed);
_transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
- _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
- if(!_has_cifg)
+ _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights,
+ &_recurrent_to_output_weights_transposed);
+ if (!_has_cifg)
{
- _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
- _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+ _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(),
+ &_input_to_input_weights_transposed);
+ _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(),
+ &_recurrent_to_input_weights_transposed);
}
- if(_has_projection)
+ if (_has_projection)
{
_transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed);
}
@@ -279,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input,
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
// Forget gate.
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
- const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
- input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
- &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
- &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+ const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+ &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+ &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res,
+ &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
_input_to_forget_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
_mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_forget_res);
- _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _cell_to_forget_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_forget_outstage_res);
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+ const float cell_to_forget_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res,
+ gemmlowp_info);
_mul_cell_to_forget_res.allocator()->allocate();
- _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
_cell_to_forget_outstage_res.allocator()->allocate();
}
Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Forget, forget_activation_input);
forget_activation_input->allocator()->allocate();
@@ -321,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Output quantization info of Sigmoid and Tanh activations
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
- const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_forget_gate);
_forget_gate.allocator()->init(forget_gate_info);
- _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
forget_activation_input->allocator()->allocate();
// Modulation gate.
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
- input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
- &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
+ const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed,
+ &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
mm_out_info, cell_outstage_info);
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
- &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
- mm_out_info, cell_outstage_info);
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+ &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
- _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+ &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
_input_to_cell_outstage_res.allocator()->allocate();
Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Cell, cell_activation_input);
cell_activation_input->allocator()->allocate();
@@ -358,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_gate);
_cell_gate.allocator()->init(cell_gate_info);
- _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _cell_gate_tanh.configure(cell_activation_input, &_cell_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
cell_activation_input->allocator()->allocate();
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_input_gate.allocator()->init(input_gate_info);
_memory_group.manage(&_input_gate);
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.allocator()->init(*_forget_gate.info());
_input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -373,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input,
}
else
{
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
- input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
- &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
- mm_out_info, input_outstage_info);
-
- const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+ &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+ &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
+
+ const float recurrent_to_input_scale =
+ _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
mm_out_info, input_outstage_info);
- _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_input_to_input_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
- _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _mul_cell_to_input_res.allocator()->init(
+ TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_input_res);
- _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(),
+ &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ const float cell_to_input_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_input_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_input_outstage_res);
- _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+ _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res,
+ gemmlowp_info);
_mul_cell_to_input_res.allocator()->allocate();
- _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_cell_to_input_outstage_res.allocator()->allocate();
}
Tensor *input_activation_input = &_recurrent_to_input_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Input, input_activation_input);
input_activation_input->allocator()->allocate();
input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
}
- _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_sigmoid.configure(input_activation_input, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
input_activation_input->allocator()->allocate();
}
// Cell.
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
- _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
- const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+ const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(mul_input_cell_scale, 0));
_memory_group.manage(&_mul_input_cell_res);
_mul_input_cell_res.allocator()->init(mul_input_cell_info);
- _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_cell_gate.allocator()->allocate();
_add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
_mul_input_cell_res.allocator()->allocate();
_forget_gate.allocator()->allocate();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
- _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+ _cell_clip.configure(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip));
}
// Output gate.
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
- input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
- &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
- mm_out_info, output_outstage_info);
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
- &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
- mm_out_info, output_outstage_info);
-
- _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.output_intermediate_scale();
+ configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+ &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+ &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res,
+ &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info);
+
+ _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res,
+ &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
_input_to_output_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
_mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_output_res);
- _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-
- const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(),
+ &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+
+ const float cell_to_output_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.output_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_output_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_output_outstage_res);
- _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+ _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res,
+ gemmlowp_info);
_mul_cell_to_output_res.allocator()->allocate();
- _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res,
+ &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
_cell_to_output_outstage_res.allocator()->allocate();
}
Tensor *output_activation_input = &_recurrent_to_output_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Output, output_activation_input);
output_activation_input->allocator()->allocate();
@@ -480,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_output_gate);
_output_gate.allocator()->init(output_gate_info);
- _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_sigmoid.configure(output_activation_input, &_output_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
output_activation_input->allocator()->allocate();
// Hidden.
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _hidden_tanh.configure(cell_state_out, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
_memory_group.manage(&_hidden_mul_res);
const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
_hidden_mul_res.allocator()->init(hidden_mul_res);
- _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate.allocator()->allocate();
_input_gate.allocator()->allocate();
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = output_state_in->info()->data_type();
@@ -502,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_hidden_gate);
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->init(*output_state_out->info());
_hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -513,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input,
_hidden_mul_res.allocator()->allocate();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
const TensorInfo projection_outstage_info(*output_state_out->info());
- const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
- gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
- gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
- gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
-
- TensorInfo projection_mm_out_info{ mm_out_info };
+ const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,
- hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
- &_mm_projection_res, &_projection_outstage_res, projection_scale,
- projection_mm_out_info, projection_outstage_info);
+ configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+ &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+ &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
ITensor *accumulate_destination = output_state_out;
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->allocate();
_projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -542,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input,
accumulate_destination = &_projection_accumulate_res;
}
- _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+ _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination,
+ ConvertPolicy::SATURATE);
_projection_outstage_res.allocator()->allocate();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
_projection_accumulate_res.allocator()->allocate();
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
- quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+ quantized_projection_clip =
+ utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));
+ _projection_clip.configure(output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip));
_has_projection_clipping = true;
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
_hidden_gate.allocator()->allocate();
@@ -576,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input,
_copy_output.configure(output_state_out, output);
}
-Status NEQLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status NEQLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
- cell_state_out, output_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ cell_state_in, output_state_in, cell_state_out, output_state_out, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -598,14 +812,28 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+ input_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8);
+ // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED
+ if (input_to_forget_weights->data_type() == DataType::QSYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
+ }
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
@@ -623,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
// Check whether peephole weights are all there or none
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
}
}
@@ -650,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
@@ -658,49 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Precompute effective bias for optimizing the matmul computations.
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.input_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_forget_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_cell_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_output_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
- lstm_params.hidden_state_zero(),
- true)));
- if(lstm_params.projection_bias() != nullptr)
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.projection_weights(), &projection_eff_bias_info,
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+ if (lstm_params.projection_bias() != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+ &projection_eff_bias_info, ConvertPolicy::SATURATE));
}
}
- const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
- const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+ const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(),
+ input_to_cell_weights->quantization_info());
+ const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1,
+ input_to_output_weights->data_type(),
+ input_to_output_weights->quantization_info());
+ const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
+ const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_cell_weights->data_type(),
+ recurrent_to_cell_weights->quantization_info());
+ const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_output_weights->data_type(),
+ recurrent_to_output_weights->quantization_info());
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
- // Validate weights transpose
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_forget_weights, &input_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
- if(!lstm_params.has_cifg_opt())
+ ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed));
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+ const TensorInfo recurrent_to_input_weights_transposed(
+ TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(),
+ lstm_params.recurrent_to_input_weights()->quantization_info());
+ const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1,
+ lstm_params.input_to_input_weights()->data_type(),
+ lstm_params.input_to_input_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed));
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
}
GEMMLowpOutputStageInfo gemmlowp_info;
@@ -713,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Forget gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
- const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
- const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_forget_scale, &mm_out_info, &forget_outstage_info));
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+ &forget_outstage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_forget_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
const ITensorInfo *b_info = forget_gate_bias;
@@ -743,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Output quantization info of Sigmoid and Tanh activations
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
- const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Modulation gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
- const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
- if(has_layer_norm)
+ const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+ &cell_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+ &cell_outstage_info, ConvertPolicy::SATURATE));
+
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
const ITensorInfo *b_info = cell_bias;
@@ -766,85 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+ "Input gate bias must not be present when CIFG is used");
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+ &forget_gate_info, ConvertPolicy::SATURATE));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+
+ // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED
+ if (input_to_forget_weights->data_type() == DataType::QSYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights());
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights,
+ lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights());
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+ lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
- const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
- if(lstm_params.has_peephole_opt())
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ const float recurrent_to_input_scale =
+ lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+ &input_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
+
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+ 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_input_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
const ITensorInfo *b_info = lstm_params.input_gate_bias();
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&input_outstage_info, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
}
// Cell.
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
- if(quantized_cell_clip > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+ if (quantized_cell_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
- quantized_cell_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip)));
}
// Output gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+ &output_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+ DataType::QSYMM16);
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
const ITensorInfo *b_info = output_gate_bias;
@@ -852,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&output_outstage_info, &output_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Hidden.
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = hidden_out_info.data_type();
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
const bool projection_tensor_copy_required = num_units != output_size;
// Projection.
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+ lstm_params.projection_weights());
ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
- const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
const TensorInfo projection_outstage_info(*output_state_out);
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
- TensorInfo projection_mm_out_info{ mm_out_info };
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+ &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
&projection_outstage_info));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+ ConvertPolicy::SATURATE));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip)));
}
}
else
{
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
}
}
- if(cell_state_out->total_size() > 0)
+ if (cell_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
}
- if(output_state_out->total_size() > 0)
+ if (output_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -955,14 +1319,14 @@ void NEQLSTMLayer::run()
_recurrent_to_forget_outstage.run();
_accumulate_input_recurrent_forget.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_forget.run();
_cell_to_forget_outstage.run();
_accumulate_cell_forget.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY);
}
@@ -977,7 +1341,7 @@ void NEQLSTMLayer::run()
_recurrent_to_cell_outstage.run();
_accumulate_input_recurrent_modulation.run();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY);
}
@@ -985,7 +1349,7 @@ void NEQLSTMLayer::run()
_cell_gate_tanh.run();
// Input gate
- if(_has_cifg)
+ if (_has_cifg)
{
_input_gate_sub.run();
}
@@ -997,14 +1361,14 @@ void NEQLSTMLayer::run()
_recurrent_to_input_outstage.run();
_accumulate_input_recurrent_input.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_input.run();
_cell_to_input_outstage.run();
_accumulate_cell_input.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY);
}
@@ -1017,7 +1381,7 @@ void NEQLSTMLayer::run()
_pixelwise_mul_input_cell.run();
_add_forget_cell.run();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
_cell_clip.run();
}
@@ -1028,14 +1392,14 @@ void NEQLSTMLayer::run()
_mm_recurrent_to_output.run();
_recurrent_to_output_outstage.run();
_accumulate_input_recurrent_output.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_output.run();
_cell_to_output_outstage.run();
_accumulate_cell_to_output.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY);
}
@@ -1048,31 +1412,31 @@ void NEQLSTMLayer::run()
_hidden_outstage.run();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
_mm_projection.run();
_projection_outstage.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_output_to_accumulate_copy.run();
}
_accumulate_projection.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.run();
}
- if(_has_projection_clipping)
+ if (_has_projection_clipping)
{
_projection_clip.run();
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.run();
}
@@ -1084,8 +1448,16 @@ void NEQLSTMLayer::run()
void NEQLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
+ if (_convert_input_to_forget_weights_to_qsymm8)
+ {
+ _input_to_forget_weights_f32.allocator()->allocate();
+ _input_to_forget_weights_symm8.allocator()->allocate();
+ _dequantize_input_to_forget_weights.run();
+ _quantize_input_to_forget_weights.run();
+ }
+
// Pre-transpose weights to be used in GEMM.
_input_to_forget_weights_transposed.allocator()->allocate();
_input_to_cell_weights_transposed.allocator()->allocate();
@@ -1101,16 +1473,25 @@ void NEQLSTMLayer::prepare()
_transpose_recurrent_to_output_weights.run();
// Precompute effective biases
- if(_has_cifg)
+ if (_has_cifg)
{
- std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+ std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 32767);
}
else
{
_input_to_input_eff_bias.allocator()->allocate();
_recurrent_to_input_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY);
+
+ ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights},
+ {TensorType::ACL_DST, &_input_to_input_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY,
+ _input_to_input_reduction->window(), packII);
+
+ ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights},
+ {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY,
+ _recurrent_to_input_reduction->window(), packRI);
_input_to_input_weights_transposed.allocator()->allocate();
_recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1125,18 +1506,45 @@ void NEQLSTMLayer::prepare()
_recurrent_to_cell_eff_bias.allocator()->allocate();
_input_to_output_eff_bias.allocator()->allocate();
_recurrent_to_output_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY);
-
- if(_has_projection)
+
+ ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights},
+ {TensorType::ACL_DST, &_input_to_forget_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY,
+ _input_to_forget_reduction->window(), packIF);
+
+ ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights},
+ {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY,
+ _recurrent_to_forget_reduction->window(), packRF);
+
+ ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights},
+ {TensorType::ACL_DST, &_input_to_cell_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(),
+ packIC);
+
+ ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights},
+ {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY,
+ _recurrent_to_cell_reduction->window(), packRC);
+
+ ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights},
+ {TensorType::ACL_DST, &_input_to_output_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY,
+ _input_to_output_reduction->window(), packIO);
+
+ ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights},
+ {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY,
+ _recurrent_to_output_reduction->window(), packRO);
+
+ if (_has_projection)
{
_projection_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY);
- if(_projection_bias != nullptr)
+ ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights},
+ {TensorType::ACL_DST, &_projection_eff_bias}};
+ NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(),
+ pack);
+ if (_projection_bias != nullptr)
{
_projection_bias_add.run();
_projection_bias->mark_as_unused();
@@ -1146,7 +1554,7 @@ void NEQLSTMLayer::prepare()
_transpose_projection_weights.run();
_projection_weights->mark_as_unused();
- if(!_projection_tensor_copy_required)
+ if (!_projection_tensor_copy_required)
{
_hidden_gate.mark_as_unused();
_projection_accumulate_res.mark_as_unused();
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index e607917615..9b72783c97 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,19 +26,19 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuQuantize.h"
+
+#include "src/cpu/operators/CpuQuantize.h"
namespace arm_compute
{
struct NEQuantizationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuQuantize> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuQuantize> op{nullptr};
};
-NEQuantizationLayer::NEQuantizationLayer()
- : _impl(std::make_unique<Impl>())
+NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>())
{
}
NEQuantizationLayer::~NEQuantizationLayer() = default;
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index d59f7da0dd..2824693800 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -27,31 +27,37 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+#include "src/common/utils/Log.h"
namespace arm_compute
{
NERNNLayer::~NERNNLayer() = default;
NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(),
+ : _memory_group(std::move(memory_manager)),
+ _gemm_state_f(),
+ _add_f(),
+ _activation(),
+ _fully_connected(memory_manager),
+ _copy_f(),
+ _fully_connected_out(),
+ _gemm_output(),
+ _add_output(),
_is_prepared(false)
{
}
-Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
- const ITensorInfo *output, const ActivationLayerInfo &info)
+Status NERNNLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -68,23 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+ auto shape_info =
+ TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+ input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info));
return Status{};
}
-void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output,
+void NERNNLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *recurrent_weights,
+ const ITensor *bias,
+ ITensor *hidden_state,
+ ITensor *output,
ActivationLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+ bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+ TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(),
+ hidden_state->info()->dimension(idx_height));
_is_prepared = false;
@@ -132,7 +149,7 @@ void NERNNLayer::run()
void NERNNLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fully_connected.prepare();
_gemm_state_f.prepare();
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
index a946358e18..68bb5d5ef3 100644
--- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,20 +23,29 @@
*/
#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
namespace arm_compute
{
-Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info));
return Status{};
}
-void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayer::configure(const ITensor *input,
+ const ITensor *rois,
+ ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
// Configure ROI pooling kernel
auto k = std::make_unique<NEROIAlignLayerKernel>();
k->configure(input, rois, output, pool_info);
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index f9434059ea..babec4aa92 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -22,26 +22,36 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
namespace arm_compute
{
NEROIPoolingLayer::~NEROIPoolingLayer() = default;
-NEROIPoolingLayer::NEROIPoolingLayer()
- : _roi_kernel()
+NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel()
{
}
-Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info);
}
-void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input,
+ const ITensor *rois,
+ const ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
_roi_kernel = std::make_unique<NEROIPoolingLayerKernel>();
_roi_kernel->configure(input, rois, output, pool_info);
}
@@ -50,4 +60,4 @@ void NEROIPoolingLayer::run()
{
NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
index 56ef2bf657..95492df126 100644
--- a/src/runtime/NEON/functions/NERange.cpp
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,21 @@
#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NERangeKernel.h"
namespace arm_compute
{
NERange::~NERange() = default;
-NERange::NERange()
- : _kernel()
+NERange::NERange() : _kernel()
{
}
void NERange::configure(ITensor *output, const float start, const float end, const float step)
{
+ ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
_kernel = std::make_unique<NERangeKernel>();
_kernel->configure(output, start, end, step);
}
@@ -50,4 +52,4 @@ void NERange::run()
{
NEScheduler::get().schedule(_kernel.get(), Window::DimX);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index b50a925f44..a23db87059 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,23 +24,25 @@
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
namespace
{
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
@@ -48,29 +50,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
//axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
// Only validate if not using auto_init for the output tensor
TensorShape out_shape = input->tensor_shape();
// Validate output_shape only if not using auto_init
convert_negative_axis(axis_local, input_dims);
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+ for (unsigned int i = 0; i < reduction_ops; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
+ if (output->total_size() > 0 && keep_dims)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
}
- if(keep_dims)
+ if (keep_dims)
{
out_shape.set(axis_local[i], 1);
}
@@ -79,19 +88,11 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
const unsigned int remove_index = axis_local[i] - i;
ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
- out_shape.remove_dimension(remove_index);
+ out_shape.remove_dimension(remove_index, false);
}
}
const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
- const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
- if(requant)
- {
- TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
- NEDequantizationLayer::validate(input, &input_no_quant);
- TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32));
- NEQuantizationLayer::validate(&output_no_quant, output);
- }
}
return Status{};
}
@@ -100,25 +101,34 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
NEReduceMean::~NEReduceMean() = default;
NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
- _output_no_quant()
+ : _memory_group(std::move(memory_manager)),
+ _reduction_kernels(),
+ _reduced_outs(),
+ _reshape(),
+ _reduction_ops(),
+ _keep_dims()
{
}
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status NEReduceMean::validate(const ITensorInfo *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ const ITensorInfo *output)
{
return validate_config(input, reduction_axis, keep_dims, output);
}
void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
+
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -126,18 +136,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
ITensor *tmp_input = input;
ITensor *tmp_output = output;
- if(_do_requant)
- {
- _memory_group.manage(&_input_no_quant);
- _memory_group.manage(&_output_no_quant);
- TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape);
- output_no_quant_info.set_data_type(DataType::F32);
- auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info);
- auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32));
- _dequant.configure(input, &_input_no_quant);
- tmp_input = &_input_no_quant;
- tmp_output = &_output_no_quant;
- }
Coordinates axis_local = reduction_axis;
const int input_dims = tmp_input->info()->num_dimensions();
@@ -145,70 +143,65 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ TensorShape out_shape =
+ i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
- if(i == _reduction_ops - 1 && keep_dims)
+ if (i == _reduction_ops - 1 && keep_dims)
{
_reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(),
+ tmp_output->info()->data_type(),
+ tmp_output->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
_reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
// Allocate intermediate tensors
- for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
-
// Configure reshape layer if we want to drop the dimensions
- if(!keep_dims)
+ if (!keep_dims)
{
TensorShape out_shape = tmp_input->info()->tensor_shape();
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(int i = 0; i < _reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+ for (int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_local[i] - i);
+ out_shape.remove_dimension(axis_local[i] - i, false);
}
auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output);
}
- if(_do_requant)
- {
- _requant.configure(&_output_no_quant, output);
- _input_no_quant.allocator()->allocate();
- _output_no_quant.allocator()->allocate();
- }
}
void NEReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_do_requant)
- {
- _dequant.run();
- }
- for(auto &kernel : _reduction_kernels)
+ for (auto &kernel : _reduction_kernels)
{
kernel.run();
}
- if(!_keep_dims)
+ if (!_keep_dims)
{
_reshape.run();
}
- if(_do_requant)
- {
- _requant.run();
- }
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 5d6f520a52..8540d750fc 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,8 +26,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
@@ -41,7 +43,7 @@ namespace
*/
size_t reduction_window_split_dimension(unsigned int axis)
{
- switch(axis)
+ switch (axis)
{
case 0:
return Window::DimY;
@@ -58,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis)
NEReductionOperation::~NEReductionOperation() = default;
NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+ : _memory_group(memory_manager),
+ _reduction_kernel(),
+ _reshape(),
+ _output_internal(),
+ _window_split(0),
+ _reduction_axis(),
+ _is_reshape_required(false)
{
}
-Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status NEReductionOperation::validate(
+ const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const auto is_reshape_required = !keep_dims;
@@ -73,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
TensorInfo info_before_reshape;
- if(is_reshape_required)
+ if (is_reshape_required)
{
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
auto shape_before_reshape = input->tensor_shape();
@@ -83,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
const auto input_num_channles = input->num_channels();
const auto input_qinfo = input->quantization_info();
- const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
- const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
+ const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
- info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo);
+ info_before_reshape.set_data_type(output_data_type)
+ .set_tensor_shape(shape_before_reshape)
+ .set_num_channels(input_num_channles)
+ .set_quantization_info(input_qinfo);
output_internal = &info_before_reshape;
}
ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op));
- if(is_reshape_required)
+ if (is_reshape_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output));
}
@@ -101,28 +115,43 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
return Status{};
}
-void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void NEReductionOperation::configure(
+ ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
_is_reshape_required = !keep_dims;
auto *output_internal = output;
const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
- const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
- const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
- const auto num_channels = input->info()->num_channels();
- const auto qinfo = input->info()->quantization_info();
-
- _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels(
- num_channels).set_quantization_info(qinfo));
+ const auto output_internal_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const auto output_external_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+ const auto num_channels = input->info()->num_channels();
+ const auto qinfo = input->info()->quantization_info();
+
+ _output_internal.allocator()->init(input->info()
+ ->clone()
+ ->set_data_type(output_data_type)
+ .set_tensor_shape(output_internal_shape)
+ .reset_padding()
+ .set_is_resizable(true)
+ .set_num_channels(num_channels)
+ .set_quantization_info(qinfo));
_memory_group.manage(&_output_internal);
output_internal = &_output_internal;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_data_type(output_data_type)
+ .set_tensor_shape(output_external_shape)
+ .reset_padding()
+ .set_is_resizable(true));
}
ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
@@ -133,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
_window_split = reduction_window_split_dimension(axis);
_reduction_axis = axis;
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.configure(output_internal, output);
_output_internal.allocator()->allocate();
@@ -144,7 +173,7 @@ void NEReductionOperation::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(_reduction_kernel.get(), _window_split);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.run();
}
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
deleted file mode 100644
index d9fd987480..0000000000
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
-
- auto k = std::make_unique<NERemapKernel>();
- k->configure(input, map_x, map_y, output, policy, border_mode, constant_border_value);
- _kernel = std::move(k);
-}
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuQuantize.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp
index 5af7f6343b..89cf575f38 100644
--- a/src/runtime/cpu/operators/CpuQuantize.cpp
+++ b/src/runtime/NEON/functions/NEReorderLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,38 +21,46 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#if defined(__aarch64__)
-#include "src/runtime/cpu/operators/CpuQuantize.h"
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuQuantizeKernel.h"
+
+#include "src/core/NEON/kernels/NEReorderKernel.h"
namespace arm_compute
{
-namespace cpu
-{
-Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+NEReorderLayer::~NEReorderLayer() = default;
+
+NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique<NEReorderKernel>())
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst));
- return Status{};
}
-void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+void NEReorderLayer::configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ auto k = std::make_unique<NEReorderKernel>();
+ k->configure(input, output, input_wf, output_wf);
+ _reorder_kernel = std::move(k);
+}
- // Configure quantize kernel
- auto k = std::make_unique<kernels::CpuQuantizeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
+void NEReorderLayer::run()
+{
+ // Run Reorder
+ NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX);
}
-void CpuQuantize::run(ITensorPack &tensors)
+Status NEReorderLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+ return NEReorderKernel::validate(input, output, input_wf, output_wf);
}
-} // namespace cpu
+
} // namespace arm_compute
+
+#endif // defined(__aarch64__)
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
index 23ca3a4eea..14e41d6df4 100644
--- a/src/runtime/NEON/functions/NEReorgLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,12 +23,15 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
namespace arm_compute
{
void NEReorgLayer::configure(const ITensor *input, ITensor *output, int32_t stride)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, stride);
+
auto k = std::make_unique<NEReorgLayerKernel>();
k->configure(input, output, stride);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index c0c78ea652..bed70ff66c 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuReshape.h"
+
+#include "src/cpu/operators/CpuReshape.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NEReshapeLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuReshape> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuReshape> op{nullptr};
};
-NEReshapeLayer::NEReshapeLayer()
- : _impl(std::make_unique<Impl>())
+NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>())
{
}
-NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
+NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default;
NEReshapeLayer::~NEReshapeLayer() = default;
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
index 36127ef83c..a90f8d2e76 100644
--- a/src/runtime/NEON/functions/NEReverse.cpp
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,19 +23,25 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEReverseKernel.h"
namespace arm_compute
{
-void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis, use_inverted_axis);
+
auto k = std::make_unique<NEReverseKernel>();
- k->configure(input, output, axis);
+ k->configure(input, output, axis, use_inverted_axis);
_kernel = std::move(k);
}
-Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status NEReverse::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
- return NEReverseKernel::validate(input, output, axis);
+ return NEReverseKernel::validate(input, output, axis, use_inverted_axis);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 0fbad07d0f..0d011064f6 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,32 +23,34 @@
*/
#include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/utils/ScaleUtils.h"
-#include "src/runtime/cpu/operators/CpuScale.h"
-#include "support/Rounding.h"
+#include "src/cpu/operators/CpuScale.h"
namespace arm_compute
{
struct NEScale::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- Tensor dx{ nullptr }; /**< Element's distance between the X real coordinate and the smallest X following integer */
- Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
- Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
- std::unique_ptr<cpu::CpuScale> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */
+ Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
+ Tensor offsets{
+ nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+ std::unique_ptr<cpu::CpuScale> op{nullptr};
};
-NEScale::NEScale()
- : _impl(std::make_unique<Impl>())
+NEScale::NEScale() : _impl(std::make_unique<Impl>())
{
}
NEScale::~NEScale() = default;
void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, info);
+
_impl->src = input;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuScale>();
@@ -56,50 +58,71 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
// Configure for size of allocation of internal tensors
// Get data layout and width/height indices
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const DataLayout data_layout =
+ info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+ input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+ input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : info.interpolation_policy;
// Get the tensor shape
TensorShape shape(output->info()->dimension(idx_width));
shape.set(1, output->info()->dimension(idx_height), false);
- const TensorInfo tensor_info_dxdy(shape, Format::F32);
- const TensorInfo tensor_info_offsets(shape, Format::S32);
+ bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+ data_layout, input->info()->data_type(), policy_to_use, info.border_mode);
- _impl->dx.allocator()->init(tensor_info_dxdy);
- _impl->dy.allocator()->init(tensor_info_dxdy);
- _impl->offsets.allocator()->init(tensor_info_offsets);
- switch(policy_to_use)
+ if (precompute_indices_weights)
{
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- // Allocate once the configure methods have been called
- _impl->offsets.allocator()->allocate();
- break;
- }
- case InterpolationPolicy::BILINEAR:
+ const TensorInfo tensor_info_dxdy(shape, Format::F32);
+ const TensorInfo tensor_info_offsets(shape, Format::S32);
+
+ _impl->dx.allocator()->init(tensor_info_dxdy);
+ _impl->dy.allocator()->init(tensor_info_dxdy);
+ _impl->offsets.allocator()->init(tensor_info_offsets);
+ switch (policy_to_use)
{
- // Allocate once the configure methods have been called
- _impl->dx.allocator()->allocate();
- _impl->dy.allocator()->allocate();
- _impl->offsets.allocator()->allocate();
- break;
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ // Allocate once the configure methods have been called
+ _impl->offsets.allocator()->allocate();
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ // Allocate once the configure methods have been called
+ _impl->dx.allocator()->allocate();
+ _impl->dy.allocator()->allocate();
+ _impl->offsets.allocator()->allocate();
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- case InterpolationPolicy::AREA:
+ }
+ else
+ {
+ if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR &&
+ policy_to_use != InterpolationPolicy::AREA)
{
- break;
- }
- default:
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
}
}
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
index f8ba9f03ed..55cad2202b 100644
--- a/src/runtime/NEON/functions/NESelect.cpp
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,16 @@
#include "arm_compute/runtime/NEON/functions/NESelect.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESelectKernel.h"
namespace arm_compute
{
void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(c, x, y, output);
+
auto k = std::make_unique<NESelectKernel>();
k->configure(c, x, y, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
index 9b08bca38a..12d43adc84 100644
--- a/src/runtime/NEON/functions/NESlice.cpp
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,17 +25,23 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void NESlice::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -45,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo
_kernel = std::move(k);
}
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
// Check start dimensions for being non-negative
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
- {
- return i < 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -64,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co
struct NESlice::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NESlice> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<experimental::NESlice> op{nullptr};
};
-NESlice::NESlice()
- : _impl(std::make_unique<Impl>())
+NESlice::NESlice() : _impl(std::make_unique<Impl>())
{
}
-NESlice::NESlice(NESlice &&) = default;
+NESlice::NESlice(NESlice &&) = default;
NESlice &NESlice::operator=(NESlice &&) = default;
NESlice::~NESlice() = default;
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
return experimental::NESlice::validate(input, output, starts, ends);
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index bee692c08b..be588c5b52 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,26 +22,26 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/runtime/cpu/operators/CpuSoftmax.h"
+#include "src/cpu/operators/CpuSoftmax.h"
namespace arm_compute
{
template <bool IS_LOG>
struct NESoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- Tensor max{ nullptr };
- std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{ nullptr };
- MemoryGroup memory_group{};
- ITensorPack run_pack{};
- WorkspaceData<Tensor> workspace_tensors{};
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr};
+ MemoryGroup memory_group{};
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace_tensors{};
};
template <bool IS_LOG>
@@ -53,9 +53,9 @@ NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryMana
template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default;
-template <bool IS_LOG>
+template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default;
-template <bool IS_LOG>
+template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
template <bool IS_LOG>
@@ -65,23 +65,24 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
_impl->src = input;
_impl->dst = output;
- _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
- _impl->op->configure(input->info(), output->info(), beta, axis);
+ _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>();
+ _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
template <bool IS_LOG>
-Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG));
return Status{};
}
template <bool IS_LOG>
-void NESoftmaxLayerGeneric<IS_LOG>::run()
+void NESoftmaxLayerGeneric<IS_LOG>::run()
{
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_impl->memory_group);
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index e8a84246fe..556ebdd800 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -28,24 +28,29 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
namespace arm_compute
{
NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
-NESpaceToBatchLayer::NESpaceToBatchLayer()
- : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
+NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
{
}
-void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+ const ITensor *block_shape,
+ const ITensor *paddings,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
_fill_f = std::make_unique<NEFill>();
@@ -55,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s
_space_to_batch_kernel->configure(input, block_shape, paddings, output);
}
-void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
_fill_f = std::make_unique<NEFill>();
@@ -69,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_
_space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
return Status{};
}
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -87,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
void NESpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- if(_has_padding)
+ if (_has_padding)
{
_fill_f->run();
}
diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
index 1e3776c448..846b619429 100644
--- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,23 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
namespace arm_compute
{
NESpaceToDepthLayer::~NESpaceToDepthLayer() = default;
-NESpaceToDepthLayer::NESpaceToDepthLayer()
- : _space_to_depth_kernel()
+NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel()
{
}
void NESpaceToDepthLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
+
_space_to_depth_kernel = std::make_unique<NESpaceToDepthLayerKernel>();
_space_to_depth_kernel->configure(input, output, block_shape);
}
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index db19bbb824..53b09e9ae5 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -34,7 +34,7 @@ namespace arm_compute
{
void NESplit::run()
{
- for(unsigned i = 0; i < _num_outputs; ++i)
+ for (unsigned i = 0; i < _num_outputs; ++i)
{
_slice_functions[i].run();
}
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index af5c80d036..2f88ffca2a 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStackLayerKernel.h"
namespace arm_compute
@@ -37,25 +39,18 @@ namespace arm_compute
NEStackLayer::~NEStackLayer() = default;
NEStackLayer::NEStackLayer() // NOLINT
- : _input(),
- _stack_kernels(),
- _num_inputs(0)
+ : _stack_kernel(std::make_unique<NEStackLayerKernel>()), _is_prepared(false)
{
}
void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
{
- _num_inputs = input.size();
- _stack_kernels.resize(_num_inputs);
+ ARM_COMPUTE_LOG_PARAMS(input, axis, output);
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- _stack_kernels[i] = std::make_unique<NEStackLayerKernel>();
- _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output);
- }
+ _stack_kernel->configure(input, axis_u, output);
}
Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
@@ -67,24 +62,20 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
const size_t rank = input[0]->num_dimensions();
const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
- const unsigned int num_inputs = input.size();
-
- for(unsigned int i = 0; i < num_inputs; i++)
- {
- // All the tensors must have the same rank
- ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
- // Validate Kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
- }
+ // Validate Kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input, axis_u, output));
return Status{};
}
void NEStackLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ if (!_is_prepared)
{
- NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);
+ _stack_kernel->prepare();
+ _is_prepared = true;
}
+
+ NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension());
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
index fffb38c3ca..6a3ac8be05 100644
--- a/src/runtime/NEON/functions/NEStridedSlice.cpp
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,24 +25,38 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+
auto k = std::make_unique<NEStridedSliceKernel>();
k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
_kernel = std::move(k);
}
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
}
@@ -50,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out
struct NEStridedSlice::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NEStridedSlice> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<experimental::NEStridedSlice> op{nullptr};
};
-NEStridedSlice::NEStridedSlice()
- : _impl(std::make_unique<Impl>())
+NEStridedSlice::NEStridedSlice() : _impl(std::make_unique<Impl>())
{
}
-NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
+NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default;
NEStridedSlice::~NEStridedSlice() = default;
-void NEStridedSlice::configure(const ITensor *input, ITensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensor *input,
+ ITensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
_impl->src = input;
_impl->dst = output;
@@ -81,10 +99,16 @@ void NEStridedSlice::run()
_impl->op->run(pack);
}
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
index 088816eb95..d10b1c8e95 100644
--- a/src/runtime/NEON/functions/NETile.cpp
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,12 +23,15 @@
*/
#include "arm_compute/runtime/NEON/functions/NETile.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NETileKernel.h"
namespace arm_compute
{
void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, multiples);
+
auto k = std::make_unique<NETileKernel>();
k->configure(input, output, multiples);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 3b3023f3b3..0144a85e8c 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -24,19 +24,20 @@
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuTranspose.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuTranspose.h"
namespace arm_compute
{
struct NETranspose::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuTranspose> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuTranspose> op{nullptr};
};
-NETranspose::NETranspose()
- : _impl(std::make_unique<Impl>())
+NETranspose::NETranspose() : _impl(std::make_unique<Impl>())
{
}
@@ -45,6 +46,7 @@ NETranspose::~NETranspose() = default;
void NETranspose::configure(const ITensor *input, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output);
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 50596dbc0a..2f7ed2bb1f 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/common/utils/Log.h"
+
namespace arm_compute
{
namespace
@@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
}
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start,
+ int32_t &slice_end_mask,
+ const unsigned int input_num_dimensions)
{
// Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time.
Coordinates slice_end;
slice_start.set_num_dimensions(input_num_dimensions);
slice_end.set_num_dimensions(input_num_dimensions);
- for(size_t k = 0; k < input_num_dimensions; ++k)
+ for (size_t k = 0; k < input_num_dimensions; ++k)
{
slice_start.set(k, 0);
slice_end.set(k, -1);
@@ -54,22 +58,23 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &
} // namespace
NEUnstack::NEUnstack() // NOLINT
- : _num_slices(0),
- _strided_slice_vector()
+ : _num_slices(0), _strided_slice_vector()
{
}
void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
{
std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
- std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t->info();
- });
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+ [](ITensor *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
+ ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis);
// Wrap around negative values
const unsigned int axis_u = wrap_axis(axis, input->info());
@@ -79,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &ou
Coordinates slice_start;
int32_t slice_end_mask;
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
- for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ for (unsigned int slice = 0; slice < _num_slices; ++slice)
{
// Adjusts start and end coordinates to take a 2D slice at a time
slice_start.set(axis_u, slice);
- _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0,
+ slice_end_mask, (1 << axis_u));
}
}
@@ -100,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn
Coordinates slice_start;
int32_t slice_end_mask;
- for(size_t k = 0; k < num_slices; ++k)
+ for (size_t k = 0; k < num_slices; ++k)
{
slice_start.set(wrap_axis(axis, input), k);
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
- ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask,
+ (1 << wrap_axis(axis, input))));
}
return Status{};
}
void NEUnstack::run()
{
- for(unsigned i = 0; i < _num_slices; ++i)
+ for (unsigned i = 0; i < _num_slices; ++i)
{
_strided_slice_vector[i].run();
}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 0bf1738bec..7334be8456 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,759 +24,93 @@
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
namespace arm_compute
{
-namespace
-{
-inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- if(input->data_type() == DataType::F32)
- {
- if(input_dims.width > 4 && input_dims.height > 4)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- }
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(input->data_type() == DataType::F16)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
+using namespace arm_compute::experimental;
-inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+struct NEWinogradConvolutionLayer::Impl
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info)));
-
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info)));
-
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
-{
- const DataLayout data_layout = input->info()->data_layout();
- const int in_width = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
- const int in_height = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
- const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const int in_batches = input->info()->dimension(3);
-
- return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
- return INEWinogradLayerTransformWeightsKernel::validate(input, weights);
-}
-
-Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type)
-{
- Size2D output_tile = Size2D{};
- if(kernel_dims == Size2D(3U, 3U))
- {
- output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
- if(data_type == DataType::F16)
- {
- output_tile = Size2D(4U, 4U);
- }
- }
- else if(kernel_dims == Size2D(5U, 5U))
- {
- output_tile = Size2D(2U, 2U);
- }
- else if(kernel_dims == Size2D(1U, 3U))
- {
- output_tile = Size2D(1U, 6U);
- }
- else if(kernel_dims == Size2D(3U, 1U))
- {
- output_tile = Size2D(6U, 1U);
- }
- else if(kernel_dims == Size2D(1U, 5U))
- {
- output_tile = Size2D(1U, 4U);
- }
- else if(kernel_dims == Size2D(5U, 1U))
- {
- output_tile = Size2D(4U, 1U);
- }
- else if(kernel_dims == Size2D(7U, 1U))
- {
- output_tile = Size2D(2U, 1U);
- }
- else if(kernel_dims == Size2D(1U, 7U))
- {
- output_tile = Size2D(1U, 2U);
- }
- return output_tile;
-}
-
-bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type)
-{
- // Check if we want to configure a Winograd configuration which requires fast math
- using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- const std::vector<WinogradConfiguration> fast_math_winograd_f16 =
- {
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3))
- };
-
- const std::vector<WinogradConfiguration> fast_math_winograd_f32 =
- {
- WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
- };
-
- auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
- std::pair<int, int>(kernel_size.width, kernel_size.height));
-
- switch(data_type)
- {
- case DataType::F16:
- return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end();
- case DataType::F32:
- return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end();
- default:
- return false;
- }
-}
-
-inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
-{
- return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
-}
-
-arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
-{
- switch(act_info.activation())
- {
- case ActivationLayerInfo::ActivationFunction::RELU:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
- }
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
- }
- default:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::None);
- }
- }
-}
-} //namespace
+ MemoryGroup memory_group{};
+ std::unique_ptr<cpu::CpuWinogradConv2d> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
+ const ITensor *original_weights{nullptr};
+ bool is_prepared{false};
+ bool is_activationlayer_enabled{false};
+ DataLayout data_layout{};
+};
NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
- _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
- _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false), _data_layout()
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_group = MemoryGroup(memory_manager);
}
-void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
- bool enable_fast_math)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
-
- // Get indices for the width and height
- _data_layout = input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
- const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
- const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
- const DataType data_type = input->info()->data_type();
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
- "This Winograd configuration requires enable_fast_math=true");
- }
-
- _weights = weights;
- _input = input;
- _output = output;
- _is_prepared = false;
-
- int n_gemms = 1;
- int N_BLOCK = 1; // Size of block used by GEMM.
-
- std::unique_ptr<INEWinogradLayerTransformInputKernel> transform_input_kernel;
- std::unique_ptr<INEWinogradLayerTransformWeightsKernel> transform_weights_kernel;
- std::unique_ptr<INEWinogradLayerTransformOutputKernel> transform_output_kernel;
-
- if(data_type == DataType::F32)
- {
- if(kernel_size == Size2D(3, 3))
- {
- if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
- {
- using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- }
- else if(kernel_size == Size2D(5, 5))
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 3))
- {
- using config = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(3, 1))
- {
- using config = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 5))
- {
- using config = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(5, 1))
- {
- using config = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 7))
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(7, 1))
- {
- using config = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(data_type == DataType::F16)
- {
- if(kernel_size == Size2D(3, 3))
- {
- using config = NEWinogradLayerConfiguration<__fp16, __fp16, 4, 4, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
-
- const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
- const bool use_same_padding = use_padding_type == PADDING_SAME;
-
- // Get convolved dimensions
- const int in_channels = input->info()->dimension(channel_idx);
- const int out_channels = output->info()->dimension(channel_idx);
-
- const Tensor4DShape in_shape(internal_get_input_shape(input));
- const size_t data_type_size = input->info()->element_size();
- // Get the memory required to instantiate a new Winograd operator.
- constexpr size_t storage_alignment = 64;
-
- // Kernel Storage
- const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
- in_channels)
- * data_type_size;
-
- // Input storage
- const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
- use_same_padding)
- * data_type_size;
-
- // Output storage
- const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
- const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
- const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
- const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
- const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
-
- // Configure GEMM
- const int tile_rows = iceildiv(output_shape.first, output_tile.height);
- const int tile_cols = iceildiv(output_shape.second, output_tile.width);
- const int m = in_shape.n_batches * tile_rows * tile_cols;
- const int k = in_shape.n_channels;
- const int n = out_channels;
- const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
- const int output_matrix_row_stride = kernel_matrix_row_stride;
-
- TensorShape a_shape(k, m, 1, n_gemms);
- Strides a_strides(data_type_size);
- a_strides.set(1, a_strides[0] * k);
- //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
- a_strides.set(2, 0);
- a_strides.set(3, data_type_size * input_matrix_stride);
-
- TensorShape b_shape(n, k, n_gemms);
- Strides b_strides(data_type_size);
- b_strides.set(1, data_type_size * kernel_matrix_row_stride);
- b_strides.set(2, data_type_size * kernel_matrix_stride);
-
- TensorShape d_shape(n, m, 1, n_gemms);
- Strides d_strides(data_type_size);
- d_strides.set(1, data_type_size * output_matrix_row_stride);
- //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
- d_strides.set(2, 0);
- d_strides.set(3, data_type_size * output_matrix_stride);
-
- TensorInfo a_info{};
- TensorInfo b_info{};
- TensorInfo d_info{};
- a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
- b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
- d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
-
- _input_transformed.allocator()->init(a_info, storage_alignment);
- _kernel_storage.allocator()->init(b_info, storage_alignment);
- _output_transformed.allocator()->init(d_info, storage_alignment);
-
- // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
- _output->info()->dimension(1), _output->info()->dimension(3)),
- 1, _output->info()->data_type());
- _output_nhwc.allocator()->init(info);
-
- const ITensor *input_to_use = _input;
- ITensor *output_to_use = _output;
- PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
- const unsigned int max_num_threads = NEScheduler::get().num_threads();
-
- // Configure the kernel to transform the input tensor from NCHW -> NHWC
- if(_data_layout == DataLayout::NCHW)
- {
- _memory_group.manage(&_input_nhwc);
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- input_to_use = &_input_nhwc;
- weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
- }
-
- // Configure input transform kernel
- _memory_group.manage(&_input_transformed);
- _memory_group.manage(&_input_workspace);
- transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_transformed, input_matrix_stride, &_input_workspace);
- const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
- TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
- _input_workspace.allocator()->init(input_workspace_info);
- _input_workspace.allocator()->allocate();
- if(_data_layout == DataLayout::NCHW)
- {
- _input_nhwc.allocator()->allocate();
- }
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
- // Configure GEMM function
- _memory_group.manage(&_output_transformed);
- _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
- _input_transformed.allocator()->allocate();
-
- // Configure output transform function
- // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- if(_data_layout == DataLayout::NCHW)
- {
- _memory_group.manage(&_output_nhwc);
- output_to_use = &_output_nhwc;
- }
- const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
-
- transform_output_kernel->configure(biases,
- &_output_transformed,
- output_matrix_stride,
- output_to_use,
- in_shape.n_batches,
- output_shape.first,
- output_shape.second,
- out_channels,
- &_output_workspace,
- activation);
+NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default;
- const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
- TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
- _output_workspace.allocator()->init(output_workspace_info);
- _output_workspace.allocator()->allocate();
- _output_transformed.allocator()->allocate();
-
- // Reorder the convoluted output to ACL's ordering NCHW
- if(_data_layout == DataLayout::NCHW)
- {
- _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
- _output_nhwc.allocator()->allocate();
- }
-
- _transform_input_kernel = std::move(transform_input_kernel);
- _transform_weights_kernel = std::move(transform_weights_kernel);
- _transform_output_kernel = std::move(transform_output_kernel);
+void NEWinogradConvolutionLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ _impl->original_weights = weights;
+ _impl->op = std::make_unique<cpu::CpuWinogradConv2d>();
+ _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ conv_info, act_info, enable_fast_math);
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info);
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(_output, nullptr, act_info);
- }
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
void NEWinogradConvolutionLayer::run()
{
prepare();
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_data_layout == DataLayout::NCHW)
- {
- //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- _permute_input.run();
- }
-
- // Transform input tensor to the winograd domain
- NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
-
- //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- _gemm_function.run();
-
- // Transform output tensor to the spatial domain
- NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
-
- if(_data_layout == DataLayout::NCHW)
- {
- // Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.run();
- }
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
-Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
-
- // Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- // Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height));
- const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
- const DataType data_type = input->data_type();
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
- "This Winograd configuration requires enable_fast_math=true");
- }
-
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- input->data_layout());
-
- // Validate input transform
- const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
- const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
- // Validate filter transform
- const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
- const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
- // Validate batched matrix multiply
- TensorShape batched_mm_output_shape = input0.tensor_shape();
- batched_mm_output_shape[0] = input1.tensor_shape()[0];
- const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-
- if(kernel_size == Size2D(3, 3))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(5, 5))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- if(kernel_size == Size2D(3, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 3))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(5, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 5))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(7, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 7))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
- }
+ return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
}
void NEWinogradConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- // Permute weights
- _weights_hwio.allocator()->allocate();
- _permute_weights.run();
- _weights->mark_as_unused();
-
- // Transform weights
- _kernel_storage.allocator()->allocate();
- NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
- _weights_hwio.allocator()->free();
+ _impl->op->prepare(_impl->prep_pack);
+ _impl->original_weights->mark_as_unused();
- _gemm_function.prepare();
- if(!_kernel_storage.is_used())
- {
- _kernel_storage.allocator()->free();
- }
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
- _is_prepared = true;
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index ca763f907b..2a5abb5f7a 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,29 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+
#include <omp.h>
namespace arm_compute
{
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+OMPScheduler::OMPScheduler() // NOLINT
+ : _num_threads(cpu_info().get_cpu_num_excluding_little()),
+ _has_lmb(cpu_info().cpu_has_little_mid_big()),
+ _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
+{
+}
+#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
OMPScheduler::OMPScheduler() // NOLINT
- : _num_threads(omp_get_max_threads())
+ : _num_threads(omp_get_max_threads()),
+ _has_lmb(cpu_info().cpu_has_little_mid_big()),
+ _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
{
}
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
unsigned int OMPScheduler::num_threads() const
{
@@ -44,7 +59,15 @@ unsigned int OMPScheduler::num_threads() const
void OMPScheduler::set_num_threads(unsigned int num_threads)
{
const unsigned int num_cores = omp_get_max_threads();
- _num_threads = (num_threads == 0) ? num_cores : num_threads;
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+ const unsigned int adjusted_num_threads = (_has_lmb) ? _nonlittle_num_cpus : num_threads;
+ _num_threads = (num_threads == 0) ? num_cores : adjusted_num_threads;
+#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+ _num_threads = (num_threads == 0) ? num_cores : num_threads;
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
}
void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
@@ -63,20 +86,20 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win
const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
const unsigned int num_threads = std::min(num_iterations, _num_threads);
- if(!kernel->is_parallelisable() || num_threads == 1)
+ if (!kernel->is_parallelisable() || num_threads == 1)
{
ThreadInfo info;
- info.cpu_info = &_cpu_info;
+ info.cpu_info = &cpu_info();
kernel->run_op(tensors, max_window, info);
}
else
{
const unsigned int num_windows = num_threads;
std::vector<IScheduler::Workload> workloads(num_windows);
- for(unsigned int t = 0; t < num_windows; t++)
+ for (unsigned int t = 0; t < num_windows; t++)
{
//Capture 't' by copy, all the other variables by reference:
- workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+ workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
{
Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
win.validate();
@@ -89,20 +112,25 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win
#ifndef DOXYGEN_SKIP_THIS
void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads)
{
- const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
- if(num_threads < 1)
+ const unsigned int amount_of_work = static_cast<unsigned int>(workloads.size());
+ const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work);
+
+ if (num_threads_to_use < 1)
{
return;
}
ThreadInfo info;
- info.cpu_info = &_cpu_info;
- info.num_threads = num_threads;
- #pragma omp parallel firstprivate(info) num_threads(num_threads)
+ info.cpu_info = &cpu_info();
+ info.num_threads = num_threads_to_use;
+#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \
+ schedule(static, 1)
+ for (unsigned int wid = 0; wid < amount_of_work; ++wid)
{
- const int tid = omp_get_thread_num();
+ const int tid = omp_get_thread_num();
+
info.thread_id = tid;
- workloads[tid](info);
+ workloads[wid](info);
}
}
#endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index a47fa184fa..d746f618b5 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -43,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment)
return (remainder != 0U) ? offset + (alignment - remainder) : offset;
}
} // namespace
-OffsetLifetimeManager::OffsetLifetimeManager()
- : _blob(0)
+OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0)
{
}
@@ -71,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings()
// Update blob size
size_t max_aggregated_size = 0;
- std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b)
- {
- max_aggregated_size += b.max_size;
- _blob.alignment = std::max(_blob.alignment, b.max_alignment);
- });
+ std::for_each(std::begin(_free_blobs), std::end(_free_blobs),
+ [&](const Blob &b)
+ {
+ max_aggregated_size += b.max_size;
+ _blob.alignment = std::max(_blob.alignment, b.max_alignment);
+ });
max_aggregated_size += _free_blobs.size() * _blob.alignment;
_blob.owners = std::max(_blob.owners, _free_blobs.size());
_blob.size = std::max(_blob.size, max_aggregated_size);
// Calculate group mappings
- auto &group_mappings = _active_group->mappings();
+ auto &group_mappings = _active_group->mappings();
size_t offset = 0;
- for(auto &free_blob : _free_blobs)
+ for (auto &free_blob : _free_blobs)
{
- for(auto &bound_element_id : free_blob.bound_elements)
+ for (auto &bound_element_id : free_blob.bound_elements)
{
ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
Element &bound_element = _active_elements[bound_element_id];
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index ffedf5586c..8f3c1a84ba 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -21,8 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <algorithm>
-
#include "arm_compute/runtime/OffsetMemoryPool.h"
#include "arm_compute/core/Error.h"
@@ -31,6 +29,8 @@
#include "arm_compute/runtime/MemoryRegion.h"
#include "arm_compute/runtime/Types.h"
+#include <algorithm>
+
namespace arm_compute
{
OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info)
@@ -50,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles)
ARM_COMPUTE_ERROR_ON(_blob == nullptr);
// Set memory to handlers
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second));
@@ -59,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles)
void OffsetMemoryPool::release(MemoryMappings &handles)
{
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_region(nullptr);
diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp
index a8ad53da90..19415b35cf 100644
--- a/src/runtime/OperatorTensor.cpp
+++ b/src/runtime/OperatorTensor.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/OperatorTensor.h"
+
#include "arm_compute/runtime/MemoryRegion.h"
#include "support/Cast.h"
@@ -47,7 +48,7 @@ ITensorInfo *OperatorTensor::info()
uint8_t *OperatorTensor::buffer() const
{
- switch(_mem_type)
+ switch (_mem_type)
{
case MemoryType::CPU:
return (uint8_t *)utils::cast::polymorphic_downcast<MemoryRegion *>(_memory->region())->buffer();
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 87376a71a4..7fb9bd8000 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -31,8 +31,7 @@
using namespace arm_compute;
-PoolManager::PoolManager()
- : _free_pools(), _occupied_pools(), _sem(), _mtx()
+PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx()
{
}
@@ -52,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool)
ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it)
- {
- return pool_it.get() == pool;
- });
+ auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools),
+ [pool](const std::unique_ptr<IMemoryPool> &pool_it) { return pool_it.get() == pool; });
ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!");
_free_pools.splice(std::begin(_free_pools), _occupied_pools, it);
_sem->signal();
@@ -78,7 +75,7 @@ std::unique_ptr<IMemoryPool> PoolManager::release_pool()
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!");
- if(!_free_pools.empty())
+ if (!_free_pools.empty())
{
std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front());
ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr);
diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp
index d1dea066e7..1de8d2abdb 100644
--- a/src/runtime/RuntimeContext.cpp
+++ b/src/runtime/RuntimeContext.cpp
@@ -28,8 +28,7 @@
namespace arm_compute
{
-RuntimeContext::RuntimeContext()
- : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get())
+RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get())
{
}
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2ad..3f1e96968a 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,7 +76,7 @@ void Scheduler::set(Type t)
bool Scheduler::is_available(Type t)
{
- if(t == Type::CUSTOM)
+ if (t == Type::CUSTOM)
{
return _custom_scheduler != nullptr;
}
@@ -93,11 +93,12 @@ Scheduler::Type Scheduler::get_type()
IScheduler &Scheduler::get()
{
- if(_scheduler_type == Type::CUSTOM)
+ if (_scheduler_type == Type::CUSTOM)
{
- if(_custom_scheduler == nullptr)
+ if (_custom_scheduler == nullptr)
{
- ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()");
+ ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) "
+ "before Scheduler::get()");
}
else
{
@@ -106,13 +107,13 @@ IScheduler &Scheduler::get()
}
else
{
- if(_schedulers.empty())
+ if (_schedulers.empty())
{
_schedulers = init();
}
auto it = _schedulers.find(_scheduler_type);
- if(it != _schedulers.end())
+ if (it != _schedulers.end())
{
return *it->second;
}
diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp
index cc21d62630..4fb08d79f5 100644
--- a/src/runtime/SchedulerFactory.cpp
+++ b/src/runtime/SchedulerFactory.cpp
@@ -48,7 +48,7 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory:
std::unique_ptr<IScheduler> SchedulerFactory::create(Type type)
{
- switch(type)
+ switch (type)
{
case Type::ST:
{
diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp
index 6f9a32c879..74ee539fec 100644
--- a/src/runtime/SchedulerUtils.cpp
+++ b/src/runtime/SchedulerUtils.cpp
@@ -47,35 +47,34 @@ std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std:
double ratio = m / static_cast<double>(n);
// nt = sqrt(max_threads * (m / n) )
- const unsigned adjusted = std::round(
- std::sqrt(max_threads * ratio));
+ const unsigned adjusted = std::round(std::sqrt(max_threads * ratio));
//find the nearest factor of max_threads
- for(unsigned i = 0; i != adjusted; ++i)
+ for (unsigned i = 0; i != adjusted; ++i)
{
//try down
const unsigned adj_down = adjusted - i;
- if(max_threads % adj_down == 0)
+ if (max_threads % adj_down == 0)
{
- return { adj_down, max_threads / adj_down };
+ return {adj_down, max_threads / adj_down};
}
//try up
const unsigned adj_up = adjusted + i;
- if(max_threads % adj_up == 0)
+ if (max_threads % adj_up == 0)
{
- return { adj_up, max_threads / adj_up };
+ return {adj_up, max_threads / adj_up};
}
}
//we didn't find anything so lets bail out with maxes biased to the largest dimension
- if(m > n)
+ if (m > n)
{
- return { std::min<unsigned>(m, max_threads), 1 };
+ return {std::min<unsigned>(m, max_threads), 1};
}
else
{
- return { 1, std::min<unsigned>(n, max_threads) };
+ return {1, std::min<unsigned>(n, max_threads)};
}
}
#endif /* #ifndef BARE_METAL */
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index ae16c8be0a..f87256abb1 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -27,8 +27,7 @@
using namespace arm_compute;
-SubTensor::SubTensor()
- : _parent(nullptr), _info()
+SubTensor::SubTensor() : _parent(nullptr), _info()
{
}
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 6dcef9f0b5..f17e323694 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -25,8 +25,7 @@
namespace arm_compute
{
-Tensor::Tensor(IRuntimeContext *)
- : _allocator(this)
+Tensor::Tensor(IRuntimeContext *) : _allocator(this)
{
}
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 4ae27c59fc..372852bfea 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -43,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c
const size_t parent_dims = parent_info.num_dimensions();
const size_t child_dims = child_info.num_dimensions();
- if(child_dims <= parent_dims)
+ if (child_dims <= parent_dims)
{
- for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
+ for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
{
const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1];
- if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
+ if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
{
is_valid = false;
break;
@@ -65,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c
}
} // namespace
-TensorAllocator::TensorAllocator(IMemoryManageable *owner)
- : _owner(owner), _associated_memory_group(nullptr), _memory()
+TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory()
{
}
@@ -88,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept
TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept
{
- if(&o != this)
+ if (&o != this)
{
_owner = o._owner;
o._owner = nullptr;
@@ -117,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &
_memory = Memory(allocator._memory.region());
// Init tensor info with new dimensions
- size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
- sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size);
+ size_t total_size =
+ parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
+ sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(),
+ parent_info.offset_element_in_bytes(coords), total_size);
// Set TensorInfo
init(sub_info);
@@ -133,7 +134,7 @@ void TensorAllocator::allocate()
{
// Align to 64-byte boundaries by default if alignment is not specified
const size_t alignment_to_use = (alignment() != 0) ? alignment() : 64;
- if(_associated_memory_group == nullptr)
+ if (_associated_memory_group == nullptr)
{
_memory.set_owned_region(std::make_unique<MemoryRegion>(info().total_size(), alignment_to_use));
}
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 15e9d43a49..a7f7b5f3cb 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -41,20 +41,17 @@ static const std::string information =
const std::string &string_from_scheduler_type(Scheduler::Type t)
{
- static std::map<Scheduler::Type, const std::string> scheduler_type_map =
- {
- { Scheduler::Type::ST, "Single Thread" },
- { Scheduler::Type::CPP, "C++11 Threads" },
- { Scheduler::Type::OMP, "OpenMP Threads" },
- { Scheduler::Type::CUSTOM, "Custom" }
- };
+ static std::map<Scheduler::Type, const std::string> scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"},
+ {Scheduler::Type::CPP, "C++11 Threads"},
+ {Scheduler::Type::OMP, "OpenMP Threads"},
+ {Scheduler::Type::CUSTOM, "Custom"}};
return scheduler_type_map[t];
}
void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints)
{
- if(ctx)
+ if (ctx)
{
ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr);
ctx->scheduler()->schedule(kernel, hints);
@@ -68,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch
unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis)
{
// We need only 1 stage for all axis except x-axis
- if(axis != 0)
+ if (axis != 0)
{
return 1;
}
diff --git a/src/runtime/cpu/ICpuOperator.h b/src/runtime/cpu/ICpuOperator.h
deleted file mode 100644
index 70ab4364c7..0000000000
--- a/src/runtime/cpu/ICpuOperator.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICPUOPERATOR_H
-#define ARM_COMPUTE_ICPUOPERATOR_H
-
-#include "arm_compute/runtime/NEON/INEOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using ICpuOperator = experimental::INEOperator;
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICPUOPERATOR_H */
diff --git a/src/runtime/cpu/operators/CpuActivation.cpp b/src/runtime/cpu/operators/CpuActivation.cpp
deleted file mode 100644
index 7753c9601f..0000000000
--- a/src/runtime/cpu/operators/CpuActivation.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuActivation.h"
-
-#include "src/core/cpu/kernels/CpuActivationKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
-{
- auto k = std::make_unique<kernels::CpuActivationKernel>();
- k->configure(input, output, activation_info);
- _kernel = std::move(k);
-}
-
-Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
-{
- return kernels::CpuActivationKernel::validate(input, output, activation_info);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuActivation.h b/src/runtime/cpu/operators/CpuActivation.h
deleted file mode 100644
index 0ae16bf958..0000000000
--- a/src/runtime/cpu/operators/CpuActivation.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ACTIVATION_H
-#define ARM_COMPUTE_CPU_ACTIVATION_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuActivationKernel */
-class CpuActivation : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuActivation() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] output Destination tensor info. Data type supported: same as @p src
- * @param[in] activation_info Activation layer parameters.
- */
- void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuActivation
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p src
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */
diff --git a/src/runtime/cpu/operators/CpuAdd.cpp b/src/runtime/cpu/operators/CpuAdd.cpp
deleted file mode 100644
index 23b09aca4f..0000000000
--- a/src/runtime/cpu/operators/CpuAdd.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuAdd.h"
-
-#include "src/core/cpu/kernels/CpuAddKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuAddKernel>();
- k->configure(src0, src1, dst, policy);
- _kernel = std::move(k);
-}
-
-Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuAdd.h b/src/runtime/cpu/operators/CpuAdd.h
deleted file mode 100644
index 8ae7833f01..0000000000
--- a/src/runtime/cpu/operators/CpuAdd.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ADD_H
-#define ARM_COMPUTE_CPU_ADD_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuAddKernel */
-class CpuAdd : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuAdd() = default;
- /** Initialise the kernel's input, dst and border mode.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- *
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuAdd
- *
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ADD_H */
diff --git a/src/runtime/cpu/operators/CpuCast.h b/src/runtime/cpu/operators/CpuCast.h
deleted file mode 100644
index 2aea2d2b09..0000000000
--- a/src/runtime/cpu/operators/CpuCast.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CAST_H
-#define ARM_COMPUTE_CPU_CAST_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuCastKernel */
-class CpuCast : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuCast() = default;
- /** Configure operator for a given list of arguments
- *
- * Input data type must be different than output data type.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:-----------------------------------------------|
- * |QASYMM8_SIGNED | S16, S32, F32, F16 |
- * |QASYMM8 | U16, S16, S32, F32, F16 |
- * |U8 | U16, S16, S32, F32, F16 |
- * |U16 | U8, U32 |
- * |S16 | QASYMM8_SIGNED, U8, S32 |
- * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 |
- * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 |
- * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8|
- *
- * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuCast::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */
diff --git a/src/runtime/cpu/operators/CpuConcatenate.cpp b/src/runtime/cpu/operators/CpuConcatenate.cpp
deleted file mode 100644
index 23eb3fceab..0000000000
--- a/src/runtime/cpu/operators/CpuConcatenate.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConcatenate.h"
-
-#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuConcatenate::CpuConcatenate()
- : _concat_kernels(), _num_srcs(0), _axis(0)
-{
-}
-
-void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis)
-{
- ARM_COMPUTE_ERROR_ON(dst == nullptr);
-
- _axis = axis;
- _num_srcs = srcs_vector.size();
-
- TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis));
-
- unsigned int offset = 0;
-
- for(unsigned int i = 0; i < _num_srcs; ++i)
- {
- switch(axis)
- {
- case Window::DimX:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case Window::DimY:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case Window::DimZ:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case 3:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
- offset += srcs_vector.at(i)->dimension(axis);
- }
-}
-
-Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
-
- unsigned int offset = 0;
- for(const auto &src : srcs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- switch(axis)
- {
- case Window::DimX:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst));
- break;
- }
- case Window::DimY:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst));
- break;
- }
- case Window::DimZ:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst));
- break;
- }
- case 3:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
- offset += src->dimension(axis);
- }
-
- if(dst->total_size() != 0)
- {
- TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
- ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
- }
-
- return Status{};
-}
-
-void CpuConcatenate::run(ITensorPack &tensors)
-{
- if(tensors.empty())
- {
- ARM_COMPUTE_ERROR("No inputs provided");
- }
-
- if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
- {
- ARM_COMPUTE_ERROR("Configured with different number of inputs");
- }
-
- int i = 0;
- for(auto &k : _concat_kernels)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
- pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
- NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack);
- ++i;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuConcatenate.h b/src/runtime/cpu/operators/CpuConcatenate.h
deleted file mode 100644
index d2af3e2ad2..0000000000
--- a/src/runtime/cpu/operators/CpuConcatenate.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONCATENATE_H
-#define ARM_COMPUTE_CPU_CONCATENATE_H
-
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref kernels::CpuConcatenateWidthKernel (if underlying concatenation axis is 0).
- * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1).
- * -# @ref kernels::CpuConcatenateDepthKernel (if underlying concatenation axis is 2).
- * -# @ref kernels::CpuConcatenateBatchKernel (if underlying concatenation axis is 3).
- */
-class CpuConcatenate : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuConcatenate();
- /** Configure operator for a given list of arguments
- *
- * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel,
- * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel.
- *
- * @param[in,out] srcs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Output tensor. Data types supported: Same as @p srcs_vector.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- */
- void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer
- *
- * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel,
- * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel.
- *
- * @param[in] srcs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Output tensor info. Data types supported: Same as @p srcs_vector.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- *
- * @return a status
- */
- static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- std::vector<std::unique_ptr<ICpuKernel>> _concat_kernels;
- unsigned int _num_srcs;
- unsigned int _axis;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */
diff --git a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp
deleted file mode 100644
index 3f2f4e95cf..0000000000
--- a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
- auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
- k->configure(src, dst, original_src_shape, data_layout);
- _kernel = std::move(k);
-}
-
-Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
- return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
-}
-
-void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
-{
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
-}
-} // namesapce cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h
deleted file mode 100644
index 3f1ddf1dbe..0000000000
--- a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_H
-#define ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */
-class CpuConvertFullyConnectedWeights : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuConvertFullyConnectedWeights() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[out] dst Destintation tensor. Data types supported: Same as @p src
- * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConvertFullyConnectedWeights
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: Same as @p dst
- * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_H */
diff --git a/src/runtime/cpu/operators/CpuCopy.cpp b/src/runtime/cpu/operators/CpuCopy.cpp
deleted file mode 100644
index 9fbe916163..0000000000
--- a/src/runtime/cpu/operators/CpuCopy.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuCopy.h"
-
-#include "src/core/cpu/kernels/CpuCopyKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuCopyKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuCopyKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
deleted file mode 100644
index 160a9fd70b..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/InfoHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- if(!is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
- const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
- info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
- info.pad_stride_info.pad_bottom());
-
- if(biases != nullptr)
- {
- const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
- }
-
- ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
-
- //Validate Activation Layer
- if(info.act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
- }
- return Status{};
-}
-} // namespace
-
-CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::CpuDepthwiseConv2dOptimizedInternal()
- : _dwc_optimized_func(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _has_bias(false), _is_quantized(false),
- _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
-{
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
-
- _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
- _has_bias = biases != nullptr;
- _is_nchw = src->data_layout() == DataLayout::NCHW;
- _permute = _is_nchw;
- _is_prepared = false;
-
- // Configure pipeline
- ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
- const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info);
- const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info);
- _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6);
-
- if(!_is_activationlayer_enabled)
- {
- act_info_to_use = info.act_info;
- }
-
- _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
- if(_is_nchw)
- {
- _permute_input = std::make_unique<cpu::CpuPermute>();
- _permute_weights = std::make_unique<cpu::CpuPermute>();
- _permute_output = std::make_unique<cpu::CpuPermute>();
-
- auto input_perm = std::make_unique<TensorInfo>();
- auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>();
-
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
- input_perm->set_data_layout(DataLayout::NHWC);
-
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
- weights_perm->set_data_layout(DataLayout::NHWC);
-
- output_perm->set_data_layout(DataLayout::NHWC);
- output_perm->set_quantization_info(dst->quantization_info());
-
- // Configure optimized depthwise
- _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
-
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- output_perm->set_data_layout(DataLayout::NHWC);
- _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
- }
- else
- {
- _dwc_optimized_func->configure(src, weights, biases, dst, info);
- }
-
- // Configure activation
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function = std::make_unique<cpu::CpuActivation>();
- _activationlayer_function->configure(dst, nullptr, info.act_info);
- }
-}
-
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- return validate_arguments_optimized(src, weights, biases, dst, info);
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- prepare(tensors);
-
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto workspace = tensors.get_tensor(TensorType::ACL_INT_3);
- auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
-
- // Permute input
- if(_permute)
- {
- ITensorPack pack;
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
- pack.add_tensor(TensorType::ACL_SRC, src);
- pack.add_tensor(TensorType::ACL_DST, src_perm);
- _permute_input->run(pack);
- }
-
- // Run assembly function
- if(_is_nchw)
- {
- auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
- auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, src_perm);
- pack.add_tensor(TensorType::ACL_SRC_1, weights_perm);
- pack.add_tensor(TensorType::ACL_SRC_2, bias);
- pack.add_tensor(TensorType::ACL_INT_0, workspace);
- pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
- pack.add_tensor(TensorType::ACL_DST, dst_perm);
- _dwc_optimized_func->run(pack);
- }
- else
- {
- auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
- auto weights = tensors.get_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, src);
- pack.add_tensor(TensorType::ACL_SRC_1, weights);
- pack.add_tensor(TensorType::ACL_SRC_2, bias);
- pack.add_tensor(TensorType::ACL_INT_0, workspace);
- pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _dwc_optimized_func->run(pack);
- }
-
- // Permute output
- if(_is_nchw)
- {
- ITensorPack pack;
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
- pack.add_tensor(TensorType::ACL_SRC, dst_perm);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output->run(pack);
- }
-
- // Run activation
- if(_is_activationlayer_enabled)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, dst);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _activationlayer_function->run(pack);
- }
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
-
- // Permute weights
- if(_permute)
- {
- auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, weights);
- pack.add_tensor(TensorType::ACL_DST, permuted_weights);
- _permute_weights->run(pack);
-
- weights->mark_as_unused();
-
- ITensorPack pack_opt;
- pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights);
- pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
- pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
-
- // Prepare optimized function
- _dwc_optimized_func->prepare(pack_opt);
- }
- else
- {
- ITensorPack pack_opt;
- pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
- pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
- pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
-
- // Prepare optimized function
- _dwc_optimized_func->prepare(pack_opt);
- }
-
- _is_prepared = true;
- }
-}
-
-CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::CpuDepthwiseConv2dGeneric()
- : _depthwise_conv_kernel(nullptr), _permute_input(nullptr), _permute_weights(nullptr), _permute_output(nullptr), _activationlayer_function(nullptr), _is_nchw(true), _is_prepared(false),
- _is_activationlayer_enabled(false)
-{
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
-
- _is_nchw = src->data_layout() == DataLayout::NCHW;
- _is_prepared = !_is_nchw;
-
- ITensorInfo *input_to_use = src;
- const ITensorInfo *weights_to_use = weights;
- ITensorInfo *output_to_use = dst;
-
- auto input_perm = std::make_unique<TensorInfo>();
- auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
-
- if(_is_nchw)
- {
- _permute_input = std::make_unique<cpu::CpuPermute>();
- _permute_weights = std::make_unique<cpu::CpuPermute>();
-
- _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
- input_perm->set_data_layout(DataLayout::NHWC);
- input_to_use = input_perm.get();
-
- _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
- weights_perm->set_data_layout(DataLayout::NHWC);
- weights_to_use = weights_perm.get();
-
- output_to_use = output_perm.get();
- }
-
- _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
- _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
-
- if(_is_nchw)
- {
- _permute_output = std::make_unique<cpu::CpuPermute>();
- _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
- output_perm->set_data_layout(DataLayout::NHWC);
- }
-
- //Configure Activation Layer
- _is_activationlayer_enabled = info.act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function = std::make_unique<cpu::CpuActivation>();
- _activationlayer_function->configure(dst, nullptr, info.act_info);
- }
-}
-
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- if(src->data_layout() == DataLayout::NCHW)
- {
- TensorShape permuted_input_shape = src->tensor_shape();
- TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
- permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
- permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
-
- const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
- }
-
- // Validate Activation Layer
- if(info.act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
- }
-
- return Status{};
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
-{
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
-
- if(_is_nchw)
- {
- prepare(tensors);
- auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
- auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, src);
- pack.add_tensor(TensorType::ACL_DST, src_perm);
- _permute_input->run(pack);
-
- ITensorPack pack_depth;
- pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm);
- pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
- pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
- pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
- }
- else
- {
- ITensorPack pack_depth;
- pack_depth.add_tensor(TensorType::ACL_SRC_0, src);
- pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
- pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
- pack_depth.add_tensor(TensorType::ACL_DST, dst);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
- }
-
- if(_is_nchw)
- {
- ITensorPack pack;
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
- pack.add_tensor(TensorType::ACL_SRC, dst_perm);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output->run(pack);
- }
-
- if(_is_activationlayer_enabled)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, dst);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _activationlayer_function->run(pack);
- }
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
-
- ARM_COMPUTE_ERROR_ON(!weights->is_used());
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, weights);
- pack.add_tensor(TensorType::ACL_DST, weights_perm);
-
- _permute_weights->run(pack);
- weights->mark_as_unused();
- _is_prepared = true;
- }
-}
-
-CpuDepthwiseConv2d::CpuDepthwiseConv2d()
- : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(), _func_generic()
-{
-}
-
-void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
-{
- _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.configure(src, weights, biases, dst, info);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.configure(src, weights, biases, dst, info);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
- }
-}
-
-Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
- switch(depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
- }
-}
-
-DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
- {
- return DepthwiseConvolutionFunction::OPTIMIZED;
- }
- else
- {
- return DepthwiseConvolutionFunction::GENERIC;
- }
-}
-
-void CpuDepthwiseConv2d::run(ITensorPack &tensors)
-{
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.run(tensors);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.run(tensors);
- break;
- default:
- ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
- }
-}
-
-void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
-{
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.prepare(tensors);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.prepare(tensors);
- break;
- default:
- ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
- }
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
deleted file mode 100644
index 049397fe60..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONV2D_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Function to execute a depthwise convolution.
- */
-class CpuDepthwiseConv2d : public ICpuOperator
-{
-public:
- /** Default constructor */
- CpuDepthwiseConv2d();
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
- /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] dst Destination tensor. Data type supported: same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- *
- * @return a Depthwise Convolution Function
- */
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ConvolutionInfo &info);
-
- // Inherited methods overriden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
-private:
- /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
- *
- * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
- *
- * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
- * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present
- * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
- * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required
- * -# @ref NEActivationLayer if fused activation is required
- *
- */
- class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
- {
- public:
- /** Default constructor */
- CpuDepthwiseConv2dOptimizedInternal();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
- /** Default move constructor */
- CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
- /** Default move assignment operator */
- CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
- /** Default destructor */
- ~CpuDepthwiseConv2dOptimizedInternal() = default;
- /** Initialize the function's source, destination, kernels and border_size.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
- // Inherited methods overriden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
- private:
- std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _has_bias{ false };
- bool _is_quantized{ false };
- bool _is_nchw{ true };
- bool _permute{ false };
- bool _is_activationlayer_enabled{ false };
- bool _is_prepared{ false };
- };
-
- /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
- *
- * -# @ref CpuDepthwiseConv2dNativeKernel
- *
- */
- class CpuDepthwiseConv2dGeneric : public ICpuOperator
- {
- public:
- /** Default constructor */
- CpuDepthwiseConv2dGeneric();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete;
- /** Default move constructor */
- CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete;
- /** Default move assignment operator */
- CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default;
- /** Default destructor */
- ~CpuDepthwiseConv2dGeneric() = default;
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dGeneric::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
- private:
- std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _is_nchw{ true };
- bool _is_prepared{ false };
- bool _is_activationlayer_enabled{ false };
- };
-
- DepthwiseConvolutionFunction _depth_conv_func;
- CpuDepthwiseConv2dOptimizedInternal _func_optimized;
- CpuDepthwiseConv2dGeneric _func_generic;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2D_H */
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
deleted file mode 100644
index a36ee1d45b..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ /dev/null
@@ -1,563 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/InfoHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
-#include "src/core/helpers/AutoConfiguration.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include <set>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x,
- int n_batches, int in_rows, int in_cols, int n_channels,
- int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
- const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
- const qasymm8::QAsymm8RescaleParams &rescale_params,
- int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
- switch(kernel_size)
- {
- case 3:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- case 5:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- default:
- return nullptr;
- }
-}
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x,
- int n_batches, int in_rows, int in_cols, int n_channels,
- neon_convolution_kernels::ActivationFunction activation,
- const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
- const qsymm8::QSymm8PerChannelRescaleParams &rescale_params,
- int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
- switch(kernel_size)
- {
- case 3:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
- n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
- n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- case 5:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
- n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
- n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- default:
- return nullptr;
- }
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x,
- int n_batches, int in_rows, int in_cols, int n_channels,
- int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
- int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
- switch(kernel_size)
- {
- case 3:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- case 5:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- default:
- return nullptr;
- }
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x,
- int n_batches, int in_rows, int in_cols, int n_channels,
- int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
- int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
- switch(kernel_size)
- {
- case 3:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- case 5:
- {
- switch(stride_x)
- {
- case 1:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- case 2:
- return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>(
- n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- default:
- return nullptr;
- }
- }
- default:
- return nullptr;
- }
-}
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo *src,
- const ITensorInfo *weights,
- ITensorInfo *output,
- const ConvolutionInfo &info)
-{
- const DataType data_type = src->data_type();
- const TensorShape shape = src->tensor_shape();
-
- const int n_batches = shape[3];
- const int in_rows = shape.z();
- const int in_cols = shape.y();
- const int n_channels = shape.x();
- const int dilation_factor = info.dilation.x();
- const int padding_top = info.pad_stride_info.pad_top();
- const int padding_left = info.pad_stride_info.pad_left();
- const int padding_bottom = info.pad_stride_info.pad_bottom();
- const int padding_right = info.pad_stride_info.pad_right();
-
- const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8);
- const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
-
- const unsigned int stride_x = info.pad_stride_info.stride().first;
- const unsigned int kernel_size = weights->tensor_shape().y();
-
- // Map activation function
- neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
- if(arm_compute::utils::info_helpers::is_relu(info.act_info))
- {
- activation = neon_convolution_kernels::ActivationFunction::ReLU;
- }
- else if(arm_compute::utils::info_helpers::is_relu6(info.act_info))
- {
- activation = neon_convolution_kernels::ActivationFunction::ReLU6;
- }
-
- // Create quantized convolver
- if(is_uniform_quantized)
- {
- const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform();
- const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform();
- const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
-
- // Check that quantization info are in the range [0, 255]
- ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
- ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
- ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
- const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
- const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
- const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
-
- // Calculate rescale parameters
- const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int32_t qmultiplier = 0;
- int32_t qshift = 0;
- quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
- qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);
-
- return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation,
- wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- }
- else if(is_perchannel_quantized)
- {
- const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform();
- const QuantizationInfo weights_qinfo = weights->quantization_info();
- const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
-
- // Check that quantization info are in the range [0, 255]
- ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
- ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
- const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
- const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() };
- const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
-
- // Calculate rescale parameters
- std::vector<float> fmultipliers;
- std::vector<int32_t> qmultipliers;
- std::vector<int32_t> qshifts;
-
- for(auto const s : wqinfo.scales)
- {
- const float fmultipler = iqinfo.scale * s / oqinfo.scale;
- int32_t qmultiplier = 0;
- int32_t qshift = 0;
- quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
- fmultipliers.push_back(fmultipler);
- qmultipliers.push_back(qmultiplier);
- qshifts.push_back(qshift);
- }
-
- qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers);
-
- return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation,
- wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
- }
- else
- {
- // Create float convolver
- switch(data_type)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- {
- return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
- }
- default:
- return nullptr;
- }
- }
-}
-} // namespace
-
-struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
-{
- std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr };
- NEDepthwiseConvolutionAssemblyKernelWrapper dwc_acl_kernel{};
- bool is_prepared{ false };
- experimental::MemoryRequirements mem_req{};
-};
-
-#ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
- : _pImpl(std::make_unique<LocalImpl>())
-{
-}
-#endif /* DOXYGEN_SKIP_THIS */
-
-CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default;
-
-void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
- const ITensorInfo *weights,
- const ITensorInfo *bias,
- ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_UNUSED(bias);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src,
- weights,
- bias != nullptr ? bias : nullptr,
- dst,
- info));
-
- // Output auto inizialitation if not yet initialized
- const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info()));
-
- _pImpl->is_prepared = false;
-
- // Create convolver
- _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info);
- ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr);
-
- // Create assembly kernel wrapper
- _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get());
-
- constexpr size_t alignment = 128;
-
- // Create workspace
- const unsigned int num_threads = NEScheduler::get().num_threads();
- const size_t workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads);
- ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment });
-
- // Create packing tensor
- const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size();
- ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
-
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment });
-}
-
-experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const
-{
- return _pImpl->mem_req;
-}
-
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src,
- const ITensorInfo *weights,
- const ITensorInfo *bias,
- const ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
-
- // Validate convolver
- ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info));
-
- // Validate activation
- const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info);
- const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info);
- ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6));
-
- // Check bias
- if(bias != nullptr)
- {
- unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
- }
-
- // Check output
- if(dst->total_size() != 0)
- {
- const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- // The uniform quantization case will only have 1 scale value in the weights quantization info
- const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform();
- const QuantizationInfo weights_qinfo = weights->quantization_info();
- const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform();
- for(auto const s : weights_qinfo.scale())
- {
- const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale;
- ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
- }
-
- return Status{};
-}
-
-bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo *src,
- const ITensorInfo *weights,
- const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
-
- // Reshape input shape if in NHWC format
- const DataLayout data_layout = src->data_layout();
- TensorShape in_shape{ src->tensor_shape() };
- if(data_layout == DataLayout::NHWC)
- {
- in_shape.set(Window::DimX, src->tensor_shape().y());
- in_shape.set(Window::DimY, src->tensor_shape().z());
- in_shape.set(Window::DimZ, src->tensor_shape().x());
- }
-
- // Check data type
- const DataType input_type = src->data_type();
- const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8;
- const DataType weights_type = weights->data_type();
- const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED
- || weights_type == DataType::QSYMM8_PER_CHANNEL;
-
- // Check weighs size
- std::set<unsigned int> supported_kernel_sizes = { 3, 5 };
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int kernel_w = weights->dimension(width_idx);
- const unsigned int kernel_h = weights->dimension(height_idx);
- bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0);
-
- // Check for supported strides
- const auto &strides = info.pad_stride_info.stride();
- bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
-
- // Check for supported padding
- const auto pad_top = info.pad_stride_info.pad_top();
- const auto pad_right = info.pad_stride_info.pad_right();
- const auto pad_bottom = info.pad_stride_info.pad_bottom();
- const auto pad_left = info.pad_stride_info.pad_left();
- PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation);
- bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
- bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
- bool supported_padding = is_same_padding || is_valid_padding;
- // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
- bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1));
-
- if(weights_type == DataType::QSYMM8_PER_CHANNEL)
- {
- is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U));
- }
-
- return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported;
-}
-
-void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
-{
- // Prepare assembly kernel
- prepare(tensors);
-
- auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
- auto workspace = tensors.get_tensor(TensorType::ACL_INT_0);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- // Setup inputs/outputs
- ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr);
- _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer()));
-
- ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr);
- const int input_element_size = src->info()->element_size();
- const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size;
- const int input_row_stride = src->info()->strides_in_bytes().z() / input_element_size;
- const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size;
- const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
- _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
-
- ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr);
- const int output_element_size = dst->info()->element_size();
- const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size;
- const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size;
- const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size;
- void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
- _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
-
- // Schedule assembly kernel
- NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX);
-}
-
-void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
-{
- if(!_pImpl->is_prepared)
- {
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1);
-
- ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr);
-
- // Pack weights and bias
- const int weights_element_size = weights->info()->element_size();
- const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size;
- const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size;
- _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(),
- weights->buffer() + weights->info()->offset_first_element_in_bytes(),
- weights_row_stride,
- weights_col_stride,
- (bias != nullptr) ? bias->buffer() : nullptr);
- _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer());
-
- weights->mark_as_unused();
- if(bias != nullptr)
- {
- bias->mark_as_unused();
- }
- _pImpl->is_prepared = true;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
deleted file mode 100644
index 195942b7fd..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H
-
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Depthwise convolution assembly kernel glue */
-class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator
-{
-public:
- /** Default constructor */
- CpuDepthwiseConv2dAssemblyDispatch();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch);
- /** Default destructor */
- ~CpuDepthwiseConv2dAssemblyDispatch();
-
- /** Initialize the function's source, destination, kernels and border_size.
- *
- * @note Supports only NHWC format
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src.
- * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
- /** Check if the optimized kernel can be used for the given kernel sizes and strides
- *
- * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
- *
- * @param[in] src Input tensor info.
- * @param[in] weights Weights tensor info.
- * @param[in] info Depthwise convolution meta-data.
- *
- * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
- */
- static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- struct LocalImpl;
- std::unique_ptr<LocalImpl> _pImpl;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */
diff --git a/src/runtime/cpu/operators/CpuDequantize.h b/src/runtime/cpu/operators/CpuDequantize.h
deleted file mode 100644
index d1fb9e8d0e..0000000000
--- a/src/runtime/cpu/operators/CpuDequantize.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H
-#define ARM_COMPUTE_CPU_DEQUANTIZE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */
-class CpuDequantize : public ICpuOperator
-{
-public:
- /** Default Constructor */
- CpuDequantize() = default;
- /** Configure the kernel.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuDequantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */
diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp
deleted file mode 100644
index 8812b777a3..0000000000
--- a/src/runtime/cpu/operators/CpuDirectConv2d.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuDirectConv2d::~CpuDirectConv2d() = default;
-
-CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
- _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
-{
-}
-
-void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>();
- _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>();
- _input_border_handler = std::make_unique<NEFillBorderKernel>();
-
- // Free accumulator
- if(_accumulator.buffer() != nullptr)
- {
- _accumulator.allocator()->free();
- }
-
- _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
-
- // Check if bias should be added in the convolution result
- _has_bias = (bias != nullptr);
-
- _conv_kernel->configure(src, weights, dst, conv_info);
- if(_has_bias)
- {
- _output_stage_kernel->configure(dst, bias);
- }
- _is_padding_required = !_conv_kernel->border_size().empty();
-
- if(_is_padding_required)
- {
- // Add zero padding XY
- _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
- }
-
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function = std::make_unique<CpuActivation>();
- _activationlayer_function->configure(dst, dst, act_info);
- }
-}
-
-Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-
- // output might not be initialized since it can be an intermediate tensor of another layer
- DataType data_type = src->data_type();
- TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
-
- // Validate Convolution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
- "Biases size and number of input feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
- }
-
- // Validate bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
-
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
- }
-
- return Status{};
-}
-
-void CpuDirectConv2d::run(ITensorPack &tensors)
-{
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- if(_is_padding_required)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_DST, src);
- NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
- }
- NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_has_bias)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, dst);
- pack.add_tensor(TensorType::ACL_SRC_1, bias);
- pack.add_tensor(TensorType::ACL_DST, dst);
- NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
- }
-
- if(_is_activationlayer_enabled)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, dst);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _activationlayer_function->run(pack);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.h b/src/runtime/cpu/operators/CpuDirectConv2d.h
deleted file mode 100644
index 9e584b9c49..0000000000
--- a/src/runtime/cpu/operators/CpuDirectConv2d.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
-#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
-#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Function to run the direct convolution.
- *
- * This function calls the following kernels:
- *
- * -# @ref NEFillBorderKernel for the input
- * -# @ref kernels::CpuDirectConv2dOutputStageKernel
- * -# @ref kernels::CpuDirectConv2dKernel
- */
-class CpuDirectConv2d : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Destructor */
- ~CpuDirectConv2d();
- /** Set the input, weights, biases and output tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
- * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
- * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32
- *
- * @param[in, out] src Input tensor info. Data types supported: F16/F32.
- * @param[in] weights Set of kernels to convolve the input volume.
- * Supported sizes: 1x1, 3x3 and 5x5.
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p src.
- * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src.
- * @param[out] dst Output tensor info.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDirectConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- MemoryGroup _memory_group;
- std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel;
- std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel;
- std::unique_ptr<NEFillBorderKernel> _input_border_handler;
- std::unique_ptr<CpuActivation> _activationlayer_function;
- Tensor _accumulator;
- bool _has_bias{ false };
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
- bool _is_padding_required{ false };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */
diff --git a/src/runtime/cpu/operators/CpuElementwise.cpp b/src/runtime/cpu/operators/CpuElementwise.cpp
deleted file mode 100644
index 8953d4769c..0000000000
--- a/src/runtime/cpu/operators/CpuElementwise.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuElementwise.h"
-#include "src/core/cpu/kernels/CpuElementwiseKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuElementwiseBase::run(ITensorPack &tensors)
-{
- // If the kernel has been configured, use the window from the kernel.
- if(_kernel->is_window_configured())
- {
- ICpuOperator::run(tensors);
- return;
- }
-
- auto src0_info = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info();
- auto src1_info = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info();
- auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape());
- ICpuOperator::run(tensors, shape_and_window.second);
-}
-
-template <ArithmeticOperation op>
-void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuArithmeticKernel>();
- k->configure(op, src0, src1, dst);
- _kernel = std::move(k);
-}
-
-template <ArithmeticOperation op>
-Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst);
-}
-
-template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
-
-void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuDivisionKernel>();
- k->configure(src0, src1, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuDivisionKernel::validate(src0, src1, dst);
-}
-
-void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuPowerKernel>();
- k->configure(src0, src1, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuPowerKernel::validate(src0, src1, dst);
-}
-
-template <ComparisonOperation COP>
-void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuComparisonKernel>();
- k->configure(COP, src0, src1, dst);
- _kernel = std::move(k);
-}
-
-template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
-}
-
-void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
-{
- auto k = std::make_unique<kernels::CpuComparisonKernel>();
- k->configure(op, src0, src1, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
-{
- return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
-}
-
-// Supported Specializations
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwise.h b/src/runtime/cpu/operators/CpuElementwise.h
deleted file mode 100644
index 899a2ffdb7..0000000000
--- a/src/runtime/cpu/operators/CpuElementwise.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseBase : public ICpuOperator
-{
-public:
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for division and power
- *
- * @note Max/Min/Squared difference supports input data type of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32
- * @note PRelu supports inpute data type of QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
-template <ArithmeticOperation op>
-class CpuElementwiseArithmetic : public CpuElementwiseBase
-{
-public:
- /** Configure the operator
- *
- * @param[in] src0 The first source tensor information.
- * @param[in] src1 The second source tensor information. With PRelu, this is used as alpha tensor.
- * @param[out] dst The output tensor information.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if the given information will lead to a valid configuration
- *
- * @param[in] src0 The first source tensor information.
- * @param[in] src1 The second source tensor information. With PRelu, this is used as alpha tensor.
- * @param[out] dst The output tensor information.
- *
- * @return A status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for maximum operation */
-using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for minimum operation */
-using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for squared difference operation */
-using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
- *
- * @note The tensor data type for the inputs must be S32/F16/F32.
- * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i])
- */
-class CpuElementwiseDivision : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs, dst and conversion policy.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division
- *
- * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function performs a elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
- * @note For an exponent that is a float, this function will only work with a positive base.
- */
-class CpuElementwisePower : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs, dst and conversion policy.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power
- *
- * @param[in] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
- *
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @note The function performs a comparison operation between two tensors.
- */
-class CpuElementwiseComparison : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs, dst and conversion policy.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: U16/U32.
- * @param[in] op Comparison Operation to be performed.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op);
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
- *
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: U16/U32.
- * @param[in] op Comparison Operation to be performed.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
- *
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @note The function performs a comparison operation between two tensors.
- */
-template <ComparisonOperation op>
-class CpuElementwiseComparisonStatic : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs, dst and conversion policy.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: U16/U32.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
- *
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: U16/U32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run equal comparison. */
-using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
-/** Basic function to run not equal comparison. */
-using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-/** Basic function to run greater comparison. */
-using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
-/** Basic function to run greater-equal comparison. */
-using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-/** Basic function to run less comparison. */
-using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
-/** Basic function to run less-equal comparison. */
-using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp b/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
deleted file mode 100644
index c79e6e9acf..0000000000
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuElementwiseUnary.h"
-#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using KernelType = kernels::CpuElementwiseUnaryKernel;
-
-void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
-{
- auto k = std::make_unique<KernelType>();
- k->configure(op, src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
-{
- return KernelType::validate(op, src, dst);
-}
-
-void CpuElementwiseUnary::run(ITensorPack &tensors)
-{
- if(_kernel->is_window_configured())
- {
- ICpuOperator::run(tensors);
- return;
- }
-
- auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info();
- ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.h b/src/runtime/cpu/operators/CpuElementwiseUnary.h
deleted file mode 100644
index 721ba2a85b..0000000000
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-
-#include "arm_compute/core/Types.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseUnary : public ICpuOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] op Unary operation to execute
- * @param[in] src Input tensor information. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
- * @param[out] dst Output tensor information. Data types supported: Same as @p src.
- */
- void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * @param[in] op Unary operation to execute
- * @param[in] src First tensor input info. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
- * @param[in] dst Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuFill.cpp b/src/runtime/cpu/operators/CpuFill.cpp
deleted file mode 100644
index 081e30ea17..0000000000
--- a/src/runtime/cpu/operators/CpuFill.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFill.h"
-
-#include "src/core/cpu/kernels/CpuFillKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value)
-{
- auto k = std::make_unique<kernels::CpuFillKernel>();
- k->configure(tensor, constant_value);
- _kernel = std::move(k);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFill.h b/src/runtime/cpu/operators/CpuFill.h
deleted file mode 100644
index fac8e76481..0000000000
--- a/src/runtime/cpu/operators/CpuFill.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FILL_H
-#define ARM_COMPUTE_CPU_FILL_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuFillKernel */
-class CpuFill : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuFill() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in,out] tensor Tensor to fill. Supported data types: All
- * @param[in] constant_value The value used to fill the planes of the tensor
- */
- void configure(const ITensorInfo *tensor, PixelValue constant_value);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FILL_H */
diff --git a/src/runtime/cpu/operators/CpuFlatten.cpp b/src/runtime/cpu/operators/CpuFlatten.cpp
deleted file mode 100644
index 58e6e4b671..0000000000
--- a/src/runtime/cpu/operators/CpuFlatten.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFlatten.h"
-
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuReshapeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuReshapeKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFlatten.h b/src/runtime/cpu/operators/CpuFlatten.h
deleted file mode 100644
index ae71453988..0000000000
--- a/src/runtime/cpu/operators/CpuFlatten.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FLATTEN_H
-#define ARM_COMPUTE_CPU_FLATTEN_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to flatten a given input */
-class CpuFlatten : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuFlatten() = default;
- /** Configure operator for a given list of arguments
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:--------------|
- * |All |All |
- *
- * @param[in] src Source tensor to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All
- * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where:
- * w = width input tensor, h = height input tensor and d = depth input tensor.
- * Data type supported: same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuFlatten::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLATTEN_H */
diff --git a/src/runtime/cpu/operators/CpuFloor.cpp b/src/runtime/cpu/operators/CpuFloor.cpp
deleted file mode 100644
index 4e169a04be..0000000000
--- a/src/runtime/cpu/operators/CpuFloor.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFloor.h"
-
-#include "src/core/cpu/kernels/CpuFloorKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuFloorKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuFloorKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFloor.h b/src/runtime/cpu/operators/CpuFloor.h
deleted file mode 100644
index cbb9d565eb..0000000000
--- a/src/runtime/cpu/operators/CpuFloor.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FLOOR_H
-#define ARM_COMPUTE_CPU_FLOOR_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuFloorKernel */
-class CpuFloor : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuFloor() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuFloor
- *
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLOOR_H */
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
deleted file mode 100644
index e50099df1f..0000000000
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-
-#include <set>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
-{
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo iqinfo = src->quantization_info();
- const QuantizationInfo wqinfo = weights->quantization_info();
- const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = src->data_type();
- // Merge activation with output stage
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
- }
- GEMMLowpOutputStageInfo os_info;
- os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- os_info.gemmlowp_offset = uoqinfo.offset;
- os_info.gemmlowp_min_bound = min_activation;
- os_info.gemmlowp_max_bound = max_activation;
- os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
- return os_info;
-}
-cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
-{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
- asm_info.ps_info = info.conv_info;
- asm_info.activation_info = info.act_info;
- asm_info.depth_output_gemm3d = true;
- asm_info.reinterpret_input_as_3d = true;
- asm_info.padding_top = info.conv_info.pad_top();
- asm_info.padding_left = info.conv_info.pad_left();
- asm_info.padding_value = 0.f;
- asm_info.negated_offsets = false;
- return asm_info;
-}
-} // namespace
-
-CpuGemmDirectConv2d::CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>(memory_manager)),
- _activation_func(std::make_unique<CpuActivation>()),
- _weights_permute_func(std::make_unique<CpuPermute>()),
- _permuted_weights_info(),
- _permuted_weights(std::make_unique<Tensor>())
-{
-}
-
-CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
-
-void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- info));
- _original_weights_info = weights;
- _weights_permute_func->configure(weights, &_permuted_weights_info, PermutationVector{ 3, 0, 1, 2 });
-
- // Configure assembly dispatch
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- if(is_data_type_quantized(src->data_type()))
- {
- asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
- }
- _gemm_asm_func->configure(src, &_permuted_weights_info, biases, dst, asm_info);
-
- // Configure activation
- if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info))
- {
- _activation_func->configure(dst, nullptr, info.act_info);
- _run_activation = true;
- }
-}
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
- const DataType data_type = src->data_type();
- const TensorShape i_shape = src->tensor_shape();
- const TensorShape w_shape = weights->tensor_shape();
- ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
- ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- // Validate biases
- if(biases != nullptr)
- {
- if(is_data_type_quantized_asymmetric(data_type))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else if(data_type == DataType::BFLOAT16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info));
- return Status{};
-}
-void CpuGemmDirectConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- _gemm_asm_func->run(tensors);
- if(_run_activation)
- {
- _activation_func->run(tensors);
- }
-}
-
-void CpuGemmDirectConv2d::allocate_permuted_weights()
-{
- // TODO: This function will be removed when memory injection is implemeted.
- ARM_COMPUTE_ERROR_ON(_permuted_weights == nullptr);
- _permuted_weights->allocator()->free();
- _permuted_weights->allocator()->init(_permuted_weights_info);
- _permuted_weights->allocator()->allocate();
-}
-
-void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- allocate_permuted_weights();
- ITensorPack permute_tensors
- {
- { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
- { TensorType::ACL_DST, _permuted_weights.get() },
- };
-
- _weights_permute_func->run(permute_tensors);
-
- tensors.get_const_tensor(TensorType::ACL_SRC_1)->mark_as_unused();
-
- // switch the original tensor with permuted tensor
- tensors.add_const_tensor(TensorType::ACL_SRC_1, _permuted_weights.get());
- _is_prepared = true;
- }
-}
-
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
deleted file mode 100644
index 6aa17c2349..0000000000
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
-#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-struct Conv2dInfo;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-class CpuActivation;
-class CpuPermute;
-
-class CpuGemmDirectConv2d : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d);
- /** Destructor */
- ~CpuGemmDirectConv2d();
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:--------------|:--------------|:--------------|
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
- *
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
- *
- * Similar to CpuGemmDirectConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
-
-private:
- std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func;
- std::unique_ptr<CpuActivation> _activation_func;
- std::unique_ptr<CpuPermute> _weights_permute_func;
- const ITensorInfo *_original_weights_info{};
- TensorInfo _permuted_weights_info;
- std::unique_ptr<Tensor> _permuted_weights{ nullptr };
- bool _is_prepared{ false };
- bool _run_activation{ false };
-
- /** Function to allocated a tensor for permuted weights
- *
- * @note This function will be removed when memory injection is properly implemented.
- */
- void allocate_permuted_weights();
-};
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */
diff --git a/src/runtime/cpu/operators/CpuMul.cpp b/src/runtime/cpu/operators/CpuMul.cpp
deleted file mode 100644
index 2f3d442a70..0000000000
--- a/src/runtime/cpu/operators/CpuMul.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuMul.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuMulKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
-}
-
-void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuMulKernel>();
- k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy);
- _kernel = std::move(k);
-}
-
-void CpuMul::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-
-Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
-}
-
-void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuComplexMulKernel>();
- k->configure(src1, src2, dst);
- _kernel = std::move(k);
-}
-
-void CpuComplexMul::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuMul.h b/src/runtime/cpu/operators/CpuMul.h
deleted file mode 100644
index 6e717188a4..0000000000
--- a/src/runtime/cpu/operators/CpuMul.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_MUL_H
-#define ARM_COMPUTE_CPU_MUL_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuMulKernel */
-class CpuMul : public ICpuOperator
-{
-public:
- /** Default Constructor */
- CpuMul() = default;
- /** Initialise the kernel's inputs, dst and convertion policy.
- *
- * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
- * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- *
- * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32).
- * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst dst tensor info. Data types supported:
- * - U8, only if both inputs are U8.
- * - QASYMM8, only if both inputs are QASYMM8.
- * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED.
- * - S16.
- * - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are S32 or both are QSYMM16.
- * - F16, only if @p src1 is F16.
- * - F32, only if both inputs are F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
- * @param[in] rounding_policy Rounding policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-
-/** Basic function to run @ref kernels::CpuComplexMulKernel */
-class CpuComplexMul : public ICpuOperator
-{
-public:
- /** Default Constructor */
- CpuComplexMul() = default;
- /** Initialise the kernel's inputs, dst.
- *
- * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuComplexMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_MUL_H */ \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuPRelu.h b/src/runtime/cpu/operators/CpuPRelu.h
deleted file mode 100644
index a6859f95d9..0000000000
--- a/src/runtime/cpu/operators/CpuPRelu.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_PRELU_H
-#define ARM_COMPUTE_CPU_PRELU_H
-
-#include "src/runtime/cpu/operators/CpuElementwise.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for PRelu operation */
-using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_PRELU_H */ \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuPermute.cpp b/src/runtime/cpu/operators/CpuPermute.cpp
deleted file mode 100644
index 7fde1e3767..0000000000
--- a/src/runtime/cpu/operators/CpuPermute.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuPermute.h"
-
-#include "src/core/cpu/kernels/CpuPermuteKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
-{
- auto k = std::make_unique<kernels::CpuPermuteKernel>();
- k->configure(src, dst, perm);
- _kernel = std::move(k);
-}
-
-Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- return kernels::CpuPermuteKernel::validate(src, dst, perm);
-}
-} // namesapce cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPermute.h b/src/runtime/cpu/operators/CpuPermute.h
deleted file mode 100644
index 2b30d7fbd8..0000000000
--- a/src/runtime/cpu/operators/CpuPermute.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_PERMUTE_H
-#define ARM_COMPUTE_CPU_PERMUTE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuPermuteKernel */
-class CpuPermute : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuPermute() = default;
- /** Configure operator for a given list of arguments
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[out] dst Destintation tensor. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPermute
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: Same as @p dst
- * @param[in] perm Permutation vector
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_PERMUTE_H */
diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp
deleted file mode 100644
index e746c8fb3b..0000000000
--- a/src/runtime/cpu/operators/CpuPool2d.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuPool2d.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/cpu/kernels/CpuPool2dKernel.h"
-#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuPool2d::CpuPool2d()
- : _pooling_layer_kernel(),
- _border_handler(),
- _asm_glue(),
- _is_global_pooling_layer(false),
- _data_layout(DataLayout::NCHW),
- _aux_mem(1)
-{
-}
-
-CpuPool2d::~CpuPool2d() = default;
-
-void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
-{
- // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
-
- // Get data layout
- _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-
- // Check if we have Global Pooling Layer
- const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height);
-
- if(run_optimised)
- {
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
-
- auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
- ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
- pooling_wrapper->configure(src, dst, pool_info, ci);
-
- // Get kernel's memory requirements
- constexpr size_t alignment = 4096;
- const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
- _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
-
- _asm_glue = std::move(pooling_wrapper);
- }
- else
- {
- // Configure pooling kernel
- auto k = std::make_unique<kernels::CpuPool2dKernel>();
- k->configure(src, dst, pool_info, indices);
- _pooling_layer_kernel = std::move(k);
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- // Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
- if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding)
- {
- zero_value = PixelValue(0, src->data_type(), src->quantization_info());
- }
- auto b = std::make_unique<NEFillBorderKernel>();
- b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value);
- _border_handler = std::move(b);
- break;
- }
- case DataLayout::NHWC:
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- }
-}
-
-Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
-
- if(run_optimised)
- {
- return Status{};
- }
-
- return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices);
-}
-
-void CpuPool2d::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
-
- if(_asm_glue)
- {
- const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
- NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
- }
- else
- {
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- // Fill border
- NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors);
-
- // Run pooling layer
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
- break;
- case DataLayout::NHWC:
- // Run pooling layer
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors);
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- }
-}
-
-experimental::MemoryRequirements CpuPool2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h
deleted file mode 100644
index 68416b5cfc..0000000000
--- a/src/runtime/cpu/operators/CpuPool2d.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOL2D_H
-#define ARM_COMPUTE_CPU_POOL2D_H
-
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward Declarations
-struct PoolingLayerInfo;
-
-namespace cpu
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
- *
- * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::CpuPool2dKernel
- * -# @ref kernels::CpuPool2dAssemblyWrapperKernel
- */
-class CpuPool2d : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuPool2d();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d);
- /** Default destructor */
- ~CpuPool2d();
- /** Set the src and dst tensors.
- *
- * @note F16 is supported for pool sizes 2 and 3 only
- *
- * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuPool2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- std::unique_ptr<INEKernel> _pooling_layer_kernel;
- std::unique_ptr<INEKernel> _border_handler;
- std::unique_ptr<INEKernel> _asm_glue;
-
- bool _is_global_pooling_layer;
- DataLayout _data_layout;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOL2D_H */
diff --git a/src/runtime/cpu/operators/CpuQuantize.h b/src/runtime/cpu/operators/CpuQuantize.h
deleted file mode 100644
index 09afffd920..0000000000
--- a/src/runtime/cpu/operators/CpuQuantize.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_QUANTIZE_H
-#define ARM_COMPUTE_CPU_QUANTIZE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuQuantizeKernel that dequantizes an input tensor */
-class CpuQuantize : public ICpuOperator
-{
-public:
- /** Default Constructor */
- CpuQuantize() = default;
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuQuantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */
diff --git a/src/runtime/cpu/operators/CpuReshape.cpp b/src/runtime/cpu/operators/CpuReshape.cpp
deleted file mode 100644
index 33c9cb87b6..0000000000
--- a/src/runtime/cpu/operators/CpuReshape.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuReshape.h"
-
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuReshapeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuReshapeKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuReshape.h b/src/runtime/cpu/operators/CpuReshape.h
deleted file mode 100644
index e136043568..0000000000
--- a/src/runtime/cpu/operators/CpuReshape.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_RESHAPE_H
-#define ARM_COMPUTE_CPU_RESHAPE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuReshapeKernel */
-class CpuReshape : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuReshape() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor info. Data type supported: All
- * @param[out] dst Destination info. Data type supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuReshape
- *
- * @param[in] src Source tensor info. Data type supported: All
- * @param[in] dst Destination tensor info. Data type supported: Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_RESHAPE_H */
diff --git a/src/runtime/cpu/operators/CpuScale.cpp b/src/runtime/cpu/operators/CpuScale.cpp
deleted file mode 100644
index 681a15e26c..0000000000
--- a/src/runtime/cpu/operators/CpuScale.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuScale.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuScaleKernel.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
-{
- ARM_COMPUTE_ERROR_ON(offsets == nullptr);
- float sampling_offset = 0.0f;
- if(sampling_policy == SamplingPolicy::CENTER)
- {
- sampling_offset = 0.5f;
- }
-
- Window win;
- win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
- win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
-
- if(dx != nullptr && dy != nullptr)
- {
- // Pre-compute the offset and pixel's distance for BILINEAR interpolation
- Iterator offsets_it(offsets, win);
- Iterator dx_it(dx, win);
- Iterator dy_it(dy, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
- const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
- const int in_xi = std::floor(in_x);
- const int in_yi = std::floor(in_y);
-
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
- *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
- },
- offsets_it, dx_it, dy_it);
- }
- else
- {
- // Pre-compute the offset for NEAREST interpolation
- Iterator offsets_it(offsets, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float float_in_xi = (id.x() + sampling_offset) * wr;
- const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- },
- offsets_it);
- }
-}
-} // namespace
-
-CpuScale::CpuScale()
- : _scale_info(InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED), _data_layout(DataLayout::UNKNOWN), _is_prepared(false)
-{
-}
-
-void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info));
-
- _scale_info = info;
-
- // Get data layout and width/height indices
- _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
-
- // Get the tensor shape
- TensorShape shape(dst->dimension(idx_width));
- shape.set(1, dst->dimension(idx_height), false);
-
- TensorInfo tensor_info_offsets(shape, Format::S32);
- TensorInfo tensor_info_dxdy(shape, Format::F32);
-
- auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy);
- auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy);
- auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets);
- auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
- switch(policy_to_use)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info);
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info);
- break;
- }
- case InterpolationPolicy::AREA:
- {
- scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported interpolation mode");
- }
- _kernel = std::move(scale_kernel);
-}
-
-Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
-
- ITensorInfo *offsets = nullptr;
- ITensorInfo *dx = nullptr;
- ITensorInfo *dy = nullptr;
-
- // Get data layout and width/height indices
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
-
- // Get the tensor shape of auxilary buffers
- const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
- TensorInfo tensor_info_offsets(shape, Format::S32);
- TensorInfo tensor_info_dx(shape, Format::F32);
- TensorInfo tensor_info_dy(shape, Format::F32);
- switch(policy_to_use)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- offsets = &tensor_info_offsets;
- break;
- case InterpolationPolicy::BILINEAR:
- offsets = &tensor_info_offsets;
- dx = &tensor_info_dx;
- dy = &tensor_info_dy;
- break;
- default:
- break;
- }
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
- return Status{};
-}
-
-void CpuScale::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- _is_prepared = true;
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- auto dx = tensors.get_tensor(TensorType::ACL_INT_0);
- auto dy = tensors.get_tensor(TensorType::ACL_INT_1);
- auto offsets = tensors.get_tensor(TensorType::ACL_INT_2);
-
- // Get data layout and width/height indices
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
- const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
-
- switch(policy_to_use)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- // Pre-compute offsets for nearest interpolation
- precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used);
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- // Pre-compute dx, dy and offsets for bilinear interpolation
- precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used);
- break;
- }
- case InterpolationPolicy::AREA:
- {
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported interpolation mode");
- }
- }
-}
-
-void CpuScale::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- prepare(tensors);
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuScale.h b/src/runtime/cpu/operators/CpuScale.h
deleted file mode 100644
index 90248a8d59..0000000000
--- a/src/runtime/cpu/operators/CpuScale.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SCALE_H
-#define ARM_COMPUTE_CPU_SCALE_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to compute Scale */
-class CpuScale : public ICpuOperator
-{
-public:
- /** Default Constructor */
- CpuScale();
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to be used for configuration
- */
- void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEScale
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to be used for validation
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
-
- // Inherited methods overridden:
- void prepare(ITensorPack &tensors) override;
- void run(ITensorPack &tensors) override;
-
-private:
- ScaleKernelInfo _scale_info;
- DataLayout _data_layout;
- bool _is_prepared;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_SCALE_H */
diff --git a/src/runtime/cpu/operators/CpuSoftmax.cpp b/src/runtime/cpu/operators/CpuSoftmax.cpp
deleted file mode 100644
index e17925ee50..0000000000
--- a/src/runtime/cpu/operators/CpuSoftmax.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuSoftmax.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <bool IS_LOG>
-CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
- : _permute_input(),
- _permute_output(),
- _max_kernel(),
- _softmax_kernel(),
- _max(),
- _tmp(),
- _input_permuted(),
- _output_permuted(),
- _needs_permute(false),
- _aux_mem(InternalTensorIdx::COUNT)
-{
-}
-
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis)
-{
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
-
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
-
- _needs_permute = actual_axis > 0;
-
- if(_needs_permute)
- {
- _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
- }
-
- // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
- // or it is the original input case (2D case)
- const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src);
-
- // Create intermediate tensors shapes
- TensorShape max_sum_shape = tmp_input->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
- TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
-
- // Init intermediate tensors
- _max = TensorInfo(max_info);
- _tmp = TensorInfo(tensor_info_tmp);
-
- // Configure kernels
- auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>();
- mk->configure(tmp_input, &_max);
- _max_kernel = std::move(mk);
-
- auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
- if(_needs_permute)
- {
- // The normalization kernel stores the result in a permuted output tensor
- sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
-
- // Re-permute the permuted output into the requested (4D) output
- _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
- }
- else
- {
- // Softmax 2D case
- sm->configure(tmp_input, &_max, dst, beta, &_tmp);
- }
- _softmax_kernel = std::move(sm);
-
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
-
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size());
-}
-
-template <bool IS_LOG>
-Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis)
-{
- // Perform validation step
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
-
- // Create intermediate tensor info
- DataType tmp_data_type = src->data_type();
- const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
- TensorShape max_sum_shape = src->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
- const TensorInfo dont_care;
-
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
-
- const bool needs_permute = actual_axis > 0;
-
- if(needs_permute)
- {
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
- TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
- TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
- }
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
-
- return Status{};
-}
-
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, false);
- CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, false);
-
- CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, false);
- CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, false);
-
- ITensorPack max_pack;
- ITensorPack softmax_pack;
-
- if(_needs_permute)
- {
- ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } };
- _permute_input.run(permute_in_pack);
-
- max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } };
-
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, input_permuted.get() },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, output_permuted.get() },
- { TensorType::ACL_DST_1, tmp.get() }
- };
- }
- else
- {
- max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } };
-
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, src },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, dst },
- { TensorType::ACL_DST_1, tmp.get() }
- };
- }
-
- NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
- NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
-
- if(_needs_permute)
- {
- ITensorPack permute_out_pack;
- permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get());
- permute_out_pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output.run(permute_out_pack);
- }
-}
-
-template <bool IS_LOG>
-experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
-{
- return _aux_mem;
-}
-
-template class CpuSoftmaxGeneric<false>;
-template class CpuSoftmaxGeneric<true>;
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuSoftmax.h b/src/runtime/cpu/operators/CpuSoftmax.h
deleted file mode 100644
index 38817977b3..0000000000
--- a/src/runtime/cpu/operators/CpuSoftmax.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
-#define ARM_COMPUTE_CPU_SOFTMAX_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuLogits1DMaxKernel;
-template <bool IS_LOG>
-class CpuLogits1DSoftmaxKernel;
-
-/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
- *
- * Softmax is calculated by :
- * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
- *
- * Log Softmax is calculated by :
- * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
- *
- * This function runs the following function/kernels:
- * -# If axis is not 0:
- * -# @ref CpuPermute
- * -# @ref kernels::CpuLogits1DMaxKernel
- * -# @ref kernels::CpuLogits1DSoftmaxKernel
- */
-template <bool IS_LOG = false>
-class CpuSoftmaxGeneric : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuSoftmaxGeneric();
- /** Set the input and output tensors.
- *
- * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * last value of each row to the nearest multiple.
- * @param[out] dst Destination tensor ifo. Data types supported: same as @p input.
- * @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
- * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuSoftmax
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p input
- * @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
- * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum InternalTensorIdx
- {
- MAX = 0,
- TMP,
- PERMUTED_SRC,
- PERMUTED_DST,
- COUNT
- };
-
- CpuPermute _permute_input;
- CpuPermute _permute_output;
- std::unique_ptr<ICpuKernel> _max_kernel;
- std::unique_ptr<ICpuKernel> _softmax_kernel;
-
- TensorInfo _max;
- TensorInfo _tmp;
- TensorInfo _input_permuted;
- TensorInfo _output_permuted;
-
- bool _needs_permute;
- experimental::MemoryRequirements _aux_mem{};
-};
-using CpuSoftmax = CpuSoftmaxGeneric<false>;
-using CpuLogSoftmax = CpuSoftmaxGeneric<true>;
-
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */
diff --git a/src/runtime/cpu/operators/CpuSub.cpp b/src/runtime/cpu/operators/CpuSub.cpp
deleted file mode 100644
index 9baaaa9d67..0000000000
--- a/src/runtime/cpu/operators/CpuSub.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuSub.h"
-
-#include "src/core/cpu/kernels/CpuSubKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuSubKernel>();
- k->configure(src0, src1, dst, policy);
- _kernel = std::move(k);
-}
-
-Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuSub.h b/src/runtime/cpu/operators/CpuSub.h
deleted file mode 100644
index 099ffef87e..0000000000
--- a/src/runtime/cpu/operators/CpuSub.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SUB_H
-#define ARM_COMPUTE_CPU_SUB_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuSubKernel */
-class CpuSub : public ICpuOperator
-{
-public:
- /** Initialise the kernel's inputs, dst and conversion policy.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[out] dst Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuSub
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
- * @param[in] src1 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
- * @param[in] dst Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
- * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SUB_H */ \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuTranspose.cpp b/src/runtime/cpu/operators/CpuTranspose.cpp
deleted file mode 100644
index 51eeb90b8b..0000000000
--- a/src/runtime/cpu/operators/CpuTranspose.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuTranspose.h"
-
-#include "src/core/cpu/kernels/CpuTransposeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuTransposeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuTransposeKernel::validate(src, dst);
-}
-} // namesapce cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuTranspose.h b/src/runtime/cpu/operators/CpuTranspose.h
deleted file mode 100644
index c0232ddab2..0000000000
--- a/src/runtime/cpu/operators/CpuTranspose.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H
-#define ARM_COMPUTE_CPU_TRANSPOSE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuTransposeKernel */
-class CpuTranspose : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuTranspose() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[out] dst Destintation tensor. Data types supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuTranspose
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: Same as @p dst
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
deleted file mode 100644
index ea3742fee5..0000000000
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ /dev/null
@@ -1,869 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
-#include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
-
-#include <arm_neon.h>
-#include <cstdlib>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-struct free_delete
-{
- void operator()(void *x)
- {
- free(x);
- }
-};
-
-struct Params
-{
- unsigned int M;
- unsigned int N;
- unsigned int K;
- unsigned int batches;
- unsigned int multis;
- unsigned int sections;
- bool indirect;
-};
-
-Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
- Params p;
- p.M = d->tensor_shape().y();
- p.K = a->tensor_shape().x();
- p.N = d->tensor_shape().x();
- p.batches = 1;
- p.multis = 1;
- p.sections = 1;
- p.indirect = false;
-
- if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
- {
- p.indirect = true;
- p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
- }
- else
- {
- p.multis = b->tensor_shape().z();
- p.batches = d->tensor_shape().total_size_upper(2) / p.multis;
- }
-
- // Update M in case of GEMM3D for output
- if(info.depth_output_gemm3d != 0)
- {
- p.M = d->tensor_shape().y() * d->tensor_shape().z();
- p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
- }
-
- return p;
-}
-
-arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
-{
- arm_gemm::Activation gemm_act;
-
- // Early exit in case lower bound is other than 0, as it's not yet supported
- if(act.b() != 0.f)
- {
- return gemm_act;
- }
-
- switch(act.activation())
- {
- case ActivationLayerInfo::ActivationFunction::RELU:
- gemm_act.type = arm_gemm::Activation::Type::ReLU;
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- gemm_act.type = arm_gemm::Activation::Type::BoundedReLU;
- gemm_act.param1 = act.a();
- gemm_act.param2 = 0.f;
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- gemm_act.type = arm_gemm::Activation::Type::BoundedReLU;
- gemm_act.param1 = act.a();
- gemm_act.param2 = act.b();
- break;
- default:
- gemm_act.type = arm_gemm::Activation::Type::None;
- }
-
- return gemm_act;
-}
-
-IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
-{
- // Schedule assembly kernel
- const int granule_threshold = 200;
- IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
- {
- scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
- }
- else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
- {
- //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
- else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
- {
- //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
-
- return scheduling_hint;
-}
-
-template <typename TypeInput, typename TypeOutput>
-class FallbackTransform : public ITransformWeights
-{
-public:
- FallbackTransform() noexcept {};
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- FallbackTransform(const FallbackTransform &) = delete;
- /** Default move constructor */
- FallbackTransform(FallbackTransform &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- FallbackTransform &operator=(const FallbackTransform &) = delete;
- /** Default move assignment operator */
- FallbackTransform &operator=(FallbackTransform &&) = default;
- void run() override
- {
- _output.allocator()->allocate();
- ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr);
- _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b);
- _reshape_run = true;
- }
-
- void release() override
- {
- _output.allocator()->free();
- }
-
- ITensor *get_weights() override
- {
- return &_output;
- }
-
- uint32_t uid() override
- {
- uint32_t id = (_B_pretranspose_size | 0x80000000);
- return id;
- }
-
- void configure(size_t B_pretranspose_size, unsigned int alignment)
- {
- _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
- _B_pretranspose_size = B_pretranspose_size;
- }
-
- void set_pretranspose(ITensor *tensor)
- {
- if(!_reshape_run)
- {
- _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer());
- }
- }
-
- void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm)
- {
- _ldb = ldb;
- _in1_ptr = in1_ptr;
- _multi_stride_b = multi_stride_b;
- _gemm_kernel_asm = gemm_kernel_asm;
- }
-
-private:
- Tensor _output{};
- int _ldb{};
- const TypeInput *_in1_ptr{};
- int _multi_stride_b{};
- size_t _B_pretranspose_size{};
- std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
-};
-
-/** Fallback in case ACL doesn't have a function */
-template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
-class Fallback : public CpuGemmAssemblyDispatch::IFallback
-{
-public:
- /** Destructor */
- ~Fallback()
- {
- if(_pretranspose && !(is_weight_managed()))
- {
- delete _pretranspose;
- }
- }
-
- /** Initialise the functions's input and output.
- *
- * @param[in] a Input tensor containing the Matrix A.
- * @param[in] b Input tensor containing the Matrix B.
- * @param[in] c Input tensor containing the Matrix C.
- * @param[out] d Output tensor to store the result of matrix multiplication.
- * @param[in] args Matrix multiplication information.
- * @param[in] gemm_info GEMM meta-data
- * @param[in] memory_group Memory group to be used by the function.
- * @param[in] weights_manager Weights manager to be used by the function.
- * @param[in] os Output stage meta-data.
- */
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
- MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});
-
- /** Set requantization shifts to be used
- *
- * @param[in] shifts Requantization shifts
- *
- * @return Pointer to the shift data
- */
- /** Set requantization data to be used
- *
- *
- * @param shifts Requantization shifts
- * @param multipliers Requantization multipliers
- *
- * @return A tuple with the pointers to the shift and multiplier data respectively
- */
- std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
- const std::vector<int32_t> &multipliers);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- bool is_configured() const override;
-
-private:
- /** Allocate a workspace tensor.
- *
- * @param[in] workspace_size Size to allocate.
- * @param[in] memory_group Tensor memory group.
- * @param[in] alignment Workspace memory alignment.
- */
- void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
- /** Configure the indirect buffer
- *
- * @param[in] a Input tensor containing the Matrix A.
- * @param[in] b Input tensor containing the Matrix B.
- * @param[out] d Output tensor to store the result of matrix multiplication.
- * @param[in] info GEMM meta-data
- */
- void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
- /** Prepare the indirect buffer */
- void prepare_indirect_buffer(ITensorPack &tensors);
-
- /** Assembly Gemm kernel */
- std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
- /** Optimised Arm® Neon™ kernel */
- std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
- /** GEMM workspace */
- Tensor _workspace{};
- /** Pre-transpose tensor */
- ITensor *_pretranspose{ nullptr };
- /** Prepared flag */
- bool _is_prepared{ false };
- /** GEMM meta-data */
- AsmGemmInfo _gemm_info{};
- /** Weights manager */
- IWeightsManager *_weights_manager{ nullptr };
- /** Weights transform object */
- FallbackTransform<TypeInput, TypeOutput> _weights_transform{};
- /** GEMM kernel description */
- arm_gemm::KernelDescription _kernel_info{};
- /** Per channel quantization shifts */
- std::vector<int32_t> _shifts{};
- std::vector<int32_t> right_shifts{};
- std::vector<int32_t> left_shifts{};
- /** Per channel quantization multipliers */
- std::vector<int32_t> _multipliers{};
- /** Indirect buffer */
- std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
- std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
- std::vector<TypeInput> _indirect_pad{};
- arm_gemm::ConvolutionParameters _cp{};
-
- bool is_weight_managed()
- {
- // TODO (COMPMID-4539): This function should do the following:
- // _weights_manager && _weights_manager->are_weights_managed(_b)
- // , where _b is the second Tensor that is used to be given to the configure().
- // Currently, however, weight manager is disabled to make this class stateless.
- // This should be revisited in the future.
- return false;
- }
-
- void acquire_managed_weight()
- {
- // TODO (COMPMID-4539): This function should do the following:
- // _pretranspose = _weights_manager->acquire(_b, &_weights_transform);
- // , where _b is the second Tensor that is used to be given to the configure().
- // Currently, however, weight manager is disabled to make this class stateless.
- _pretranspose = nullptr;
- }
-};
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
-{
- _multipliers = multipliers;
- _shifts = shifts;
- bool need_left = false;
- for(const auto s : _shifts)
- {
- left_shifts.push_back(std::max(-s, int32_t(0)));
- right_shifts.push_back(std::min(-s, int32_t(0)));
- if(s < 0 && !need_left)
- {
- need_left = true;
- }
- }
- return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
-{
- auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());
- const int multis = 1;
- const int batches = a->info()->tensor_shape().total_size_upper(3);
- const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
- const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
-
- const size_t output_hw = _cp.output_height * _cp.output_width;
- const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
- const size_t batch_stride = batch_size / sizeof(TypeInput);
- const int multi_size = batch_size * batches;
- const size_t multi_stride = multi_size / sizeof(TypeInput);
-
- for(int64_t m = 0; m < multis; m++)
- {
- for(int64_t b = 0; b < batches; b++)
- {
- for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
- {
- for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
- {
- int64_t output_xy = (output_y * _cp.output_width) + output_x;
-
- for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
- {
- for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
- {
- int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
- int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
- int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
- int64_t input_xy = (input_y * _cp.input_width) + input_x;
-
- if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
- {
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
- }
- else
- {
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
- A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
- }
- }
- }
- }
- }
- }
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
-
- float zeropad = 0.f;
- if(is_data_type_quantized(a->data_type()))
- {
- zeropad = a->quantization_info().uniform().offset;
- }
-
- const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]);
- const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]);
- const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
- const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
- const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
- const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
- const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
-
- _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
- info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
- };
-
- if(info.method == AsmConvMethod::Conv)
- {
- _gemm_kernel_asm->set_convolution_parameters(_cp);
- }
-
- if(info.method == AsmConvMethod::Indirect)
- {
- const unsigned int multis = 1;
- const unsigned int batches = a->tensor_shape().total_size_upper(3);
- const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
- const unsigned int output_hw = _cp.output_width * _cp.output_height;
-
- using TypeInputPtr = TypeInput *;
- const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr);
- const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
- const int multi_size = batch_size * batches;
- const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
-
- _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
- _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
- _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
-
- // Set indirect argument
- int64_t pos = 0;
- for(int64_t m = 0; m < multis; m++)
- {
- for(int64_t b = 0; b < batches; b++)
- {
- for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
- {
- (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
- }
- }
- }
-
- _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
- MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
-{
- ARM_COMPUTE_UNUSED(c);
- arm_gemm::GemmConfig gemm_cfg;
- _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
- _weights_manager = weights_manager;
- if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
- {
- gemm_cfg.filter = _kernel_info.name;
- args._cfg = &gemm_cfg;
- }
- _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
- if(_gemm_kernel_asm == nullptr)
- {
- //configuration not supported: Leave function unconfigured:
- return;
- }
-
- // arm_compute wrapper for the Gemm object (see above)
- auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
- ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
- acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
- const size_t workspace_size = _gemm_kernel_asm->get_working_size();
- if(workspace_size > 0)
- {
- // Allocate workspace
- const unsigned int alignment = 4096;
- allocate_workspace(workspace_size, memory_group, alignment);
- }
-
- //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
- //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
- {
- const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
- if(window_size < static_cast<unsigned int>(args._maxthreads))
- {
- _gemm_kernel_asm->set_nthreads(window_size);
- }
- }
-
- _optimised_kernel = std::move(acl_gemm_wrapper);
- _gemm_info = gemm_info;
- // Check for pre-transposed support
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- // Forcing 128-byte alignment (required by 32-bit kernels)
- const unsigned int alignment = 128;
- const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
- if(is_weight_managed())
- {
- _weights_transform.configure(B_pretranspose_size, alignment);
- acquire_managed_weight();
- }
- else
- {
- _pretranspose = new Tensor();
- static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
- }
- }
-
- // Handle indirect GEMM convolution
- if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
- {
- configure_indirect(a, b, d, gemm_info);
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
-{
- auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- if(!_is_prepared)
- {
- // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C.
- if(c && c->info()->data_type() == DataType::S32)
- {
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
- }
-
- // Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- const int ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
-
- if(is_weight_managed())
- {
- _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
- _weights_manager->run(b, &_weights_transform);
-
- // If we didn't run the reshape function, set the pretransposed buffer
- if(!_weights_transform.is_reshape_run())
- {
- _weights_transform.set_pretranspose(_pretranspose);
- }
- }
- else
- {
- static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
- ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr);
- _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
- b->mark_as_unused();
- }
- }
-
- if(_gemm_info.method == AsmConvMethod::Indirect)
- {
- prepare_indirect_buffer(tensors);
- }
-
- _is_prepared = true;
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
-{
- ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment);
- memory_group.manage(&_workspace);
- _workspace.allocator()->allocate();
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
-{
- return _optimised_kernel != nullptr;
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
-{
- auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto d = tensors.get_tensor(TensorType::ACL_DST);
-
- int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
- int ldb = 0;
- const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
-
- const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
- const size_t a_multi_idx = a_batch_idx + 1;
- const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
- const size_t d_multi_idx = d_batch_idx + 1;
-
- int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
- const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
-
- int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
- int multi_stride_b = 0;
- const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
-
- auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
- const TypeInput *in1_ptr = nullptr;
- auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
-
- // Check if B is pre-tranposed and de-reference if not
- if(!_gemm_kernel_asm->B_is_pretransposed())
- {
- ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
- in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- }
-
- const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
-
- // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
- if(_workspace.buffer() != nullptr)
- {
- _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
- const unsigned int split_dim = scheduling_hint.split_dimension();
- const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
- unsigned int num_threads = NEScheduler::get().num_threads();
- if(window_size < num_threads)
- {
- num_threads = window_size;
- }
- if(split_dim != IScheduler::split_dimensions_all)
- {
- // Make sure the kernel does not expect more threads than we can actually spawn
- const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
- num_threads = std::min(num_iterations, num_threads);
- }
- _gemm_kernel_asm->set_nthreads(num_threads);
- }
-
- // Prepare assembly kernel
- prepare(tensors);
-
- // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C.
- TypeOutput *bias = nullptr;
- if(c && c->info()->data_type() != DataType::S32)
- {
- bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
- }
-
- if(_gemm_info.method == AsmConvMethod::Indirect)
- {
- in0_ptr = nullptr;
- lda = 0;
- batch_stride_a = 0;
- multi_stride_a = 0;
- }
-
- // Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
- in1_ptr, ldb, multi_stride_b,
- out_ptr, ldd, batch_stride_d, multi_stride_d,
- bias, 0);
- // Schedule
- NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
- IWeightsManager *weights_manager)
-{
- Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
-
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
-
- // Create arm_gemm fallback
- auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
- fallback->configure(a, b, c, d, args, info, memory_group, weights_manager);
- arm_gemm = std::move(fallback);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
- IWeightsManager *weights_manager)
-{
- ARM_COMPUTE_UNUSED(activation);
- Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
-
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
-
- // Create arm_gemm fallback
- auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
-
- // Configure requantization info
- const int32_t negation = info.negated_offsets ? 1 : -1;
- const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
- const int32_t b_offset = -b->quantization_info().uniform().offset * negation;
- const GEMMLowpOutputStageInfo os_info = info.output_stage;
-
- arm_gemm::Requantize32 gemm_requant_info{};
- if(os_info.gemmlowp_shifts.size() > 1)
- {
- const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
- std::get<2>(requantize_data),
- std::get<3>(requantize_data),
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
- }
- else
- {
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
- }
-
- // Configure fallback
- fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info);
- arm_gemm = std::move(fallback);
-}
-
-} //namespace
-
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager)
-{
-}
-
-Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_UNUSED(c, info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
-
-#ifndef __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
-#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_data_type_quantized_per_channel(b->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
- return Status{};
-}
-
-bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
-{
- arm_gemm::Activation act = map_to_arm_gemm_activation(activation);
- return act.type != arm_gemm::Activation::Type::None;
-}
-
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
- arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info);
-
- //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
- {
- return;
- }
-
- switch(a->data_type())
- {
- case DataType::F32:
- create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
- {
- create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
- }
- else
- {
- create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
- }
- break;
- case DataType::S8:
- case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
- {
- create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
- }
- else
- {
- create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
- }
- break;
-#endif /* __aarch64__ */
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
- break;
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
-}
-
-void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->prepare(tensors);
-}
-
-bool CpuGemmAssemblyDispatch::is_configured() const
-{
- return _arm_gemm != nullptr && _arm_gemm->is_configured();
-}
-
-void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
-{
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->run(tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
deleted file mode 100644
index ffc097c75c..0000000000
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/* Convolution method supported by the assembly gemm interface */
-enum class AsmConvMethod
-{
- Im2Col,
- Indirect,
- Conv
-};
-
-struct AsmGemmInfo
-{
- AsmConvMethod method{ AsmConvMethod::Im2Col };
- PadStrideInfo ps_info{};
- ActivationLayerInfo activation_info{};
- GEMMLowpOutputStageInfo output_stage{};
- bool negated_offsets{ true };
- bool reinterpret_input_as_3d{ false };
- bool depth_output_gemm3d{ false };
- int64_t padding_top{ 0 };
- int64_t padding_left{ 0 };
- float padding_value{ 0.f };
-};
-
-/** Assembly kernel glue */
-class CpuGemmAssemblyDispatch : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
- /** Defautl destructor */
- ~CpuGemmAssemblyDispatch() = default;
-
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch);
-
- class IFallback
- {
- public:
- virtual void run(ITensorPack &tensors) = 0;
- virtual void prepare(ITensorPack &tensors) = 0;
- virtual bool is_configured() const = 0;
- virtual ~IFallback() = default;
- };
-
-public:
- /** If supported create a Compute Library function else fallback to the arm_gemm function.
- *
- * @param[in] a Input tensor (Matrix A)
- * @param[in] b Input tensor (Matrix B)
- * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations
- * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] info GEMM meta-data
- */
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
-
- /** Indicates whether or not this function can be used to process the given parameters.
- *
- * @param[in] a Input tensor info (Matrix A)
- * @param[in] b Input tensor info (Matrix B)
- * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
- * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] info GEMM meta-data
- *
- * @return a status.
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
- /** Checks if activation is supported by the gemm assembly dispatcher
- *
- * @param[in] activation Activation to check
- *
- * @return True if activation is supported else false
- */
- static bool is_activation_supported(const ActivationLayerInfo &activation);
- /** Was the function successfully configured ?
- *
- * @return True if the function is configured and ready to run
- */
- bool is_configured() const;
-
- // Inherited methods overridden:
- void prepare(ITensorPack &tensors) override;
- void run(ITensorPack &tensors) override;
-
-private:
- std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
- MemoryGroup _memory_group; /**< Function memory group */
- IWeightsManager *_weights_manager; /**< Pointer to the weights manager */
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */
diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
deleted file mode 100644
index 644018a718..0000000000
--- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
-#define ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
-
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/* Tensor handler to wrap and handle tensor allocations on workspace buffers */
-class CpuAuxTensorHandler
-{
-public:
- CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
- : _tensor()
- {
- _tensor.allocator()->soft_init(info);
-
- ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
- if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
- {
- _tensor.allocator()->allocate();
- if(pack_inject)
- {
- pack.add_tensor(slot_id, &_tensor);
- _injected_tensor_pack = &pack;
- _injected_slot_id = slot_id;
- }
- }
- else
- {
- _tensor.allocator()->import_memory(packed_tensor->buffer());
- }
- }
-
- CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor)
- : _tensor()
- {
- _tensor.allocator()->soft_init(info);
- if(info.total_size() <= tensor.info()->total_size())
- {
- _tensor.allocator()->import_memory(tensor.buffer());
- }
- }
-
- CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
- CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete;
-
- ~CpuAuxTensorHandler()
- {
- if(_injected_tensor_pack)
- {
- _injected_tensor_pack->remove_tensor(_injected_slot_id);
- }
- }
-
- ITensor *get()
- {
- return &_tensor;
- }
-
- ITensor *operator()()
- {
- return &_tensor;
- }
-
-private:
- Tensor _tensor{};
- ITensorPack *_injected_tensor_pack{ nullptr };
- int _injected_slot_id{ TensorType::ACL_UNKNOWN };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */ \ No newline at end of file
diff --git a/src/runtime/gpu/cl/IClOperator.h b/src/runtime/gpu/cl/IClOperator.h
deleted file mode 100644
index 049bf05dc1..0000000000
--- a/src/runtime/gpu/cl/IClOperator.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICL_OPERATOR_H
-#define ARM_COMPUTE_ICL_OPERATOR_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/runtime/CL/ICLOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using IClOperator = experimental::ICLOperator;
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICL_OPERATOR_H */
diff --git a/src/runtime/gpu/cl/operators/ClActivation.cpp b/src/runtime/gpu/cl/operators/ClActivation.cpp
deleted file mode 100644
index 71aa57bdbd..0000000000
--- a/src/runtime/gpu/cl/operators/ClActivation.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClActivation.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClActivationKernel>();
- k->configure(compile_context, src, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClActivationKernel::validate(src, dst, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClActivation.h b/src/runtime/gpu/cl/operators/ClActivation.h
deleted file mode 100644
index 235b826b87..0000000000
--- a/src/runtime/gpu/cl/operators/ClActivation.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ACTIVATION_H
-#define ARM_COMPUTE_CL_ACTIVATION_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClActivationKernel */
-class ClActivation : public IClOperator
-{
-public:
- /** Constructor */
- ClActivation() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] activation_info Activation layer parameters.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info);
- /** Static function to check if given info will lead to a valid configuration of @ref ClActivation
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ACTIVATION_H */
diff --git a/src/runtime/gpu/cl/operators/ClAdd.cpp b/src/runtime/gpu/cl/operators/ClAdd.cpp
deleted file mode 100644
index 01f550f819..0000000000
--- a/src/runtime/gpu/cl/operators/ClAdd.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClAdd.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
- _kernel = std::move(k);
-}
-
-Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClAdd.h b/src/runtime/gpu/cl/operators/ClAdd.h
deleted file mode 100644
index f751d8dc83..0000000000
--- a/src/runtime/gpu/cl/operators/ClAdd.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ADD_H
-#define ARM_COMPUTE_CL_ADD_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run arithmetic addition
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @note The function performs an arithmetic addition between two tensors.
- */
-class ClAdd : public IClOperator
-{
-public:
- /** Default Constructor */
- ClAdd() = default;
- /** Configure function for a given list of arguments.
- *
- * Valid configurations (src1,src2) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref ClAdd
- *
- * Valid configurations (src1,src2) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ADD_H */
diff --git a/src/runtime/gpu/cl/operators/ClCast.cpp b/src/runtime/gpu/cl/operators/ClCast.cpp
deleted file mode 100644
index 3f54004aa7..0000000000
--- a/src/runtime/gpu/cl/operators/ClCast.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClCast.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClCastKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
- auto k = std::make_unique<kernels::ClCastKernel>();
- k->configure(compile_context, src, dst, policy);
- _kernel = std::move(k);
-}
-
-Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- return kernels::ClCastKernel::validate(src, dst, policy);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClCast.h b/src/runtime/gpu/cl/operators/ClCast.h
deleted file mode 100644
index 69e028debd..0000000000
--- a/src/runtime/gpu/cl/operators/ClCast.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CAST_H
-#define ARM_COMPUTE_CL_CAST_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCastKernel */
-class ClCast : public IClOperator
-{
-public:
- /** Constructor */
- ClCast() = default;
- /** Configure operator for a given list of arguments
- *
- * @note Input data type must be different than output data type.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:--------------------------------------|
- * |U8 | S8, U16, S16, U32, S32, F16, F32 |
- * |U16 | U8, S8, S16, U32, S32, F16, F32 |
- * |S16 | U8, S8, U16, U32, S32, F16, F32 |
- * |U32 | U8, S8, U16, S16, S32, F16, F32 |
- * |S32 | U8, S8, U16, S16, U32, F16, F32 |
- * |F16 | U8, S8, U16, S16, U32, F32 |
- * |F32 | U8, S8, U16, S16, U32, F16 |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[out] dst The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClCast::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CAST_H */
diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.cpp b/src/runtime/gpu/cl/operators/ClConcatenate.cpp
deleted file mode 100644
index 4385fcfaed..0000000000
--- a/src/runtime/gpu/cl/operators/ClConcatenate.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClConcatenate.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-ClConcatenate::ClConcatenate()
- : _concat_kernels(),
- _num_inputs(0),
- _axis(Window::DimX)
-{
-}
-
-/** Create and configure one kernel per input (or a single fused kernel for 2/4 inputs along X).
- *
- * The sources are null-checked before any of them is dereferenced; the destination info is
- * auto-initialised from the concatenated shape and the first source's data type when empty.
- *
- * @param[in]     compile_context The compile context to be used.
- * @param[in,out] src_vector      Tensor infos to concatenate. Must not be empty or contain null entries.
- * @param[out]    dst             Destination tensor info. Must not be null.
- * @param[in]     axis            Concatenation axis (Window::DimX, Window::DimY, Window::DimZ or 3).
- */
-void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
-{
-    ARM_COMPUTE_ERROR_ON(dst == nullptr);
-    // src_vector[0] is read below to pick the data type, so an empty list is a caller error.
-    ARM_COMPUTE_ERROR_ON(src_vector.empty());
-
-    // Null-check the sources BEFORE the shape calculation: it reads through every pointer,
-    // so checking afterwards (as was done previously) could never catch a null entry.
-    std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
-    std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
-        return t;
-    });
-
-    _axis       = axis;
-    _num_inputs = src_vector.size();
-
-    const TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
-
-    // Auto-initialise dst if it has not been initialised yet
-    auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
-    ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
-
-    // Running position along the concatenation axis at which the next source is written.
-    unsigned int offset = 0;
-    switch(_axis)
-    {
-        case Window::DimX:
-        {
-            switch(_num_inputs)
-            {
-                case 2:
-                {
-                    // Configure WidthConcatenate2Tensors kernel
-                    auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>();
-                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst);
-                    _concat_kernels.emplace_back(std::move(kernel));
-                    break;
-                }
-                case 4:
-                {
-                    // Configure WidthConcatenate4Tensors kernel
-                    auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
-                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst);
-                    _concat_kernels.emplace_back(std::move(kernel));
-                    break;
-                }
-                default:
-                {
-                    // Configure generic case WidthConcatenate kernels
-                    for(unsigned int i = 0; i < _num_inputs; ++i)
-                    {
-                        auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
-                        kernel->configure(compile_context, src_vector.at(i), offset, dst);
-                        offset += src_vector.at(i)->dimension(_axis);
-                        _concat_kernels.emplace_back(std::move(kernel));
-                    }
-                    break;
-                }
-            }
-            break;
-        }
-        case Window::DimY:
-        {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
-                kernel->configure(compile_context, src_vector.at(i), offset, dst);
-                offset += src_vector.at(i)->dimension(_axis);
-                _concat_kernels.emplace_back(std::move(kernel));
-            }
-            break;
-        }
-        case Window::DimZ:
-        {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
-                kernel->configure(compile_context, src_vector.at(i), offset, dst);
-                offset += src_vector.at(i)->dimension(_axis);
-                _concat_kernels.emplace_back(std::move(kernel));
-            }
-            break;
-        }
-        case 3:
-        {
-            // Batch axis
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
-                kernel->configure(compile_context, src_vector.at(i), offset, dst);
-                offset += src_vector.at(i)->dimension(_axis);
-                _concat_kernels.emplace_back(std::move(kernel));
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Axis not supported");
-    }
-}
-
-/** Check whether the given infos would lead to a valid configuration.
- *
- * Every failure, including an unsupported axis, is reported through the returned Status so that
- * callers can probe a configuration without triggering a hard error.
- *
- * @param[in] src_vector Tensor infos to concatenate. At least two non-null entries are required.
- * @param[in] dst        Destination tensor info. Must not be null.
- * @param[in] axis       Concatenation axis (Window::DimX, Window::DimY, Window::DimZ or 3).
- *
- * @return a status
- */
-Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
-    const unsigned int num_inputs = src_vector.size();
-    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
-
-    // Null-check every source once so that all axes get the same treatment; the loops and the
-    // shape calculation below dereference each entry.
-    for(const auto &src : src_vector)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-    }
-
-    // Running position along the concatenation axis at which the next source would be written.
-    unsigned int offset = 0;
-    switch(axis)
-    {
-        case Window::DimX:
-        {
-            switch(num_inputs)
-            {
-                case 2:
-                    // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
-                    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
-                    break;
-                case 4:
-                    // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
-                    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
-                    break;
-                default:
-                    // Validate generic case of WidthConcatenate kernel
-                    for(const auto &src : src_vector)
-                    {
-                        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
-                        offset += src->dimension(axis);
-                    }
-                    break;
-            }
-            break;
-        }
-        case Window::DimY:
-        {
-            for(const auto &src : src_vector)
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
-                offset += src->dimension(axis);
-            }
-            break;
-        }
-        case Window::DimZ:
-        {
-            for(const auto &src : src_vector)
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
-                offset += src->dimension(axis);
-            }
-            break;
-        }
-        case 3:
-        {
-            // Batch axis
-            for(const auto &src : src_vector)
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
-                offset += src->dimension(axis);
-            }
-            break;
-        }
-        default:
-            // validate() reports problems through its return value rather than raising an error.
-            ARM_COMPUTE_RETURN_ERROR_MSG("Axis not supported");
-    }
-
-    // Only compare against dst when it has already been given a shape.
-    if(dst->total_size() != 0)
-    {
-        TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
-        ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
-    }
-
-    return Status{};
-}
-
-/** Enqueue the configured concatenation kernels on the CL scheduler.
- *
- * @param[in,out] tensors Pack holding the sources (from ACL_SRC_VEC onwards) and the destination (ACL_DST).
- */
-void ClConcatenate::run(ITensorPack &tensors)
-{
-    if(tensors.empty())
-    {
-        ARM_COMPUTE_ERROR("No inputs provided");
-    }
-
-    // The pack must hold exactly one entry more than the number of configured sources.
-    const int expected_inputs = static_cast<int>(_num_inputs);
-    const int provided_inputs = static_cast<int>(tensors.size()) - 1;
-    if(provided_inputs != expected_inputs)
-    {
-        ARM_COMPUTE_ERROR("Configured with different number of inputs");
-    }
-
-    // Width concatenation of 2 or 4 tensors was configured as one kernel that takes the whole pack.
-    const bool single_width_kernel = (_axis == Window::DimX) && (_num_inputs == 2 || _num_inputs == 4);
-    if(single_width_kernel)
-    {
-        ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
-        CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
-        return;
-    }
-
-    // Otherwise there is one kernel per source: hand each its own source plus the shared destination.
-    int src_index = 0;
-    for(auto &kernel : _concat_kernels)
-    {
-        ITensorPack kernel_pack;
-        kernel_pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + src_index));
-        kernel_pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
-        CLScheduler::get().enqueue_op(*kernel, kernel_pack, true);
-        ++src_index;
-    }
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.h b/src/runtime/gpu/cl/operators/ClConcatenate.h
deleted file mode 100644
index 0d960a605c..0000000000
--- a/src/runtime/gpu/cl/operators/ClConcatenate.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONCATENATE_H
-#define ARM_COMPUTE_CLCONCATENATE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
- * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
- * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
- * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
- *
- * @note For axis 0 with exactly 2 or 4 inputs, a single @ref kernels::ClWidthConcatenate2TensorsKernel or
- * @ref kernels::ClWidthConcatenate4TensorsKernel is used instead of one kernel per input.
- */
-class ClConcatenate : public IClOperator
-{
-public:
- /** Default constructor */
- ClConcatenate();
- /** Initialise the kernel's inputs vector and dst.
- *
- * @note Input and dst tensor dimensions preconditions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
- * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
- *
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] src_vector The vectors containing all the tensors info to concatenate. Data types supported: All
- * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- */
- void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis);
- /** Static function to check if given info will lead to a valid configuration of @ref ClConcatenate
- *
- * @note Input and dst tensor dimensions preconditions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
- * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
- *
- * @param[in] src_vector The vectors containing all the tensors info to concatenate. Data types supported: All
- * @param[in] dst Destination tensor info. Data types supported: same as @p src_vector.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- *
- * @return a status
- */
- static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- /** Kernels created by configure() and enqueued by run() */
- std::vector<std::unique_ptr<IClKernel>> _concat_kernels;
- /** Number of source tensors given to configure(); run() checks the tensor pack against it */
- unsigned int _num_inputs;
- /** Concatenation axis given to configure() */
- unsigned int _axis;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCONCATENATE_H */
diff --git a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
deleted file mode 100644
index 0d2f2925d3..0000000000
--- a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
-    // Configure a fresh kernel first; the operator takes ownership only afterwards.
-    auto weights_kernel = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
-    weights_kernel->configure(compile_context, src, dst, original_src_shape, data_layout);
-
-    _kernel = std::move(weights_kernel);
-}
-
-Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
-    // The operator adds no checks of its own: the kernel's verdict is returned as-is.
-    Status kernel_status = kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
-    return kernel_status;
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h
deleted file mode 100644
index efedc2fcb7..0000000000
--- a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CONVERTFULLYCONNECTEDWEIGHTS_H
-#define ARM_COMPUTE_CL_CONVERTFULLYCONNECTEDWEIGHTS_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */
-class ClConvertFullyConnectedWeights : public IClOperator
-{
-public:
- /** Constructor */
- ClConvertFullyConnectedWeights() = default;
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[out] dst The dst tensor info. Data types supported: Same as @p src
- * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClConvertFullyConnectedWeightsKernel.
- *
- * Takes the same arguments as @ref ClConvertFullyConnectedWeights::configure(), minus the compile context.
- *
- * @param[in] src First tensor src info. Data types supported: All.
- * @param[in] dst Output tensor info. Data types supported: same as @p src.
- * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CONVERTFULLYCONNECTEDWEIGHTS_H */
diff --git a/src/runtime/gpu/cl/operators/ClCopy.cpp b/src/runtime/gpu/cl/operators/ClCopy.cpp
deleted file mode 100644
index 2bdb1f5ba1..0000000000
--- a/src/runtime/gpu/cl/operators/ClCopy.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClCopy.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClCopyKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
-{
-    // Configure a fresh copy kernel first; the operator takes ownership only afterwards.
-    auto copy_kernel = std::make_unique<kernels::ClCopyKernel>();
-    copy_kernel->configure(compile_context, src, dst, dst_window);
-
-    _kernel = std::move(copy_kernel);
-}
-
-Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window)
-{
-    // The operator adds no checks of its own: the kernel's verdict is returned as-is.
-    Status kernel_status = kernels::ClCopyKernel::validate(src, dst, dst_window);
-    return kernel_status;
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClCopy.h b/src/runtime/gpu/cl/operators/ClCopy.h
deleted file mode 100644
index 0b99676f65..0000000000
--- a/src/runtime/gpu/cl/operators/ClCopy.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_COPY_H
-#define ARM_COMPUTE_CL_COPY_H
-
-#include "arm_compute/core/Window.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCopyKernel */
-class ClCopy : public IClOperator
-{
-public:
- /** Constructor */
- ClCopy() = default;
- /** Initialise the function's source and destination.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src.
- * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClCopyKernel
- *
- * Takes the same arguments as @ref ClCopy::configure(), minus the compile context.
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src.
- * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_COPY_H */
diff --git a/src/runtime/gpu/cl/operators/ClCrop.cpp b/src/runtime/gpu/cl/operators/ClCrop.cpp
deleted file mode 100644
index 17bb11912f..0000000000
--- a/src/runtime/gpu/cl/operators/ClCrop.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClCrop.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClCropKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
-{
-    // Configure a fresh crop kernel first; the operator takes ownership only afterwards.
-    auto crop_kernel = std::make_unique<kernels::ClCropKernel>();
-    crop_kernel->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window);
-
-    _kernel = std::move(crop_kernel);
-}
-
-Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
-{
-    // The operator adds no checks of its own: the kernel's verdict is returned as-is.
-    Status kernel_status = kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window);
-    return kernel_status;
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClCrop.h b/src/runtime/gpu/cl/operators/ClCrop.h
deleted file mode 100644
index acfbf14742..0000000000
--- a/src/runtime/gpu/cl/operators/ClCrop.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CROP_H
-#define ARM_COMPUTE_CL_CROP_H
-
-#include "arm_compute/core/Window.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCropKernel */
-class ClCrop : public IClOperator
-{
-public:
- /** Constructor */
- ClCrop() = default;
- /** Initialise the function's source and destination.
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
- * @param[out] dst Destination tensor info. Data type supported: F32
- * @param[in] start Coordinates of where to start cropping the image.
- * @param[in] end Coordinates of where to end cropping the image.
- * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
-
- /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClCropKernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
- * @param[in] dst Destination tensor info. Data type supported: F32
- * @param[in] start Coordinates of where to start cropping the image.
- * @param[in] end Coordinates of where to end cropping the image.
- * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CROP_H */
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.cpp b/src/runtime/gpu/cl/operators/ClDequantize.cpp
deleted file mode 100644
index 0c1391bb45..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantize.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDequantize.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
-    // Configure a fresh dequantize kernel first; the operator takes ownership only afterwards.
-    auto dequantize_kernel = std::make_unique<kernels::ClDequantizeKernel>();
-    dequantize_kernel->configure(compile_context, src, dst);
-
-    _kernel = std::move(dequantize_kernel);
-}
-
-Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    // The operator adds no checks of its own: the kernel's verdict is returned as-is.
-    Status kernel_status = kernels::ClDequantizeKernel::validate(src, dst);
-    return kernel_status;
-}
-
-/** Enqueue the configured dequantize kernel with the given tensor pack.
- *
- * @param[in,out] tensors Tensor pack handed to the kernel. Must not be empty.
- */
-void ClDequantize::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
-    auto &scheduler = CLScheduler::get();
-    scheduler.enqueue_op(*_kernel, tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.h b/src/runtime/gpu/cl/operators/ClDequantize.h
deleted file mode 100644
index 47fad3eeee..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantize.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H
-#define ARM_COMPUTE_CL_DEQUANTIZE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */
-class ClDequantize : public IClOperator
-{
-public:
- /** Constructor */
- ClDequantize() = default;
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] dst Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClDequantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
deleted file mode 100644
index 13ef42a640..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace
-{
-ITensorPack select_activation_src_dst(ITensorPack &tensors)
-{
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
- pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
- return pack;
-}
-} // namespace
-
-void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
-
- // Configure direct convolution kernel
- const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo();
- auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
- k->set_target(CLScheduler::get().target());
- k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info);
- _direct_conv_kernel = std::move(k);
-
- // Configure border handler
- PixelValue zero_value(0.f);
- if(is_data_type_quantized_asymmetric(src->data_type()))
- {
- zero_value = PixelValue(0, src->data_type(), src->quantization_info());
- }
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
- _src_border_handler = std::move(b);
-
- // Fused activation is currently supported for NHWC and floating point types
- if(act_info.enabled() && !conv2d_act_info.enabled())
- {
- auto a = std::make_unique<kernels::ClActivationKernel>();
- a->configure(compile_context, dst, dst, act_info);
- _activation_kernel = std::move(a);
- }
-
- // Tune kernels
- CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
-}
-
-Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), CLScheduler::get().target()));
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
- }
- return Status{};
-}
-
-void ClDirectConv2d::run(ITensorPack &tensors)
-{
- // Run border handler
- CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
- // Run direct convolution
- CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
- // Run activation kernel
- if(_activation_kernel)
- {
- auto act_pack = select_activation_src_dst(tensors);
- CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
- }
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.h b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
deleted file mode 100644
index e069733fab..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConv2d.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
-#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClDirectConv2d
- */
-class ClDirectConv2d : public IClOperator
-{
-public:
- /** Constructor */
- ClDirectConv2d() = default;
- /** Set the src and dst tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of srcs.
- * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
- * Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClDirectConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-
-private:
- std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
- std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
- std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */ \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp b/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp
deleted file mode 100644
index e5b836a0d8..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClElementwiseOperations.h"
-
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info);
-}
-
-void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info);
-}
-
-void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info);
-}
-
-void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
-}
-
-void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseOperations.h b/src/runtime/gpu/cl/operators/ClElementwiseOperations.h
deleted file mode 100644
index b9ab1405c8..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseOperations.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
-#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function performs an arithmetic division between two tensors.
- */
-class ClElementwiseDivision : public IClOperator
-{
-public:
- /** Default Constructor */
- ClElementwiseDivision() = default;
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: F16/F32.
- * @param[in] src2 Second source tensor info. same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref ClElementwiseDivision
- *
- * @param[in] src1 First source tensor info. Data types supported: F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
- * @note The function performs a max operation between two tensors.
- */
-class ClElementwiseMax : public IClOperator
-{
-public:
- /** Default Constructor */
- ClElementwiseMax() = default;
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for max
- *
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
- * @note The function performs a max operation between two tensors.
- */
-class ClElementwiseMin : public IClOperator
-{
-public:
- /** Default Constructor */
- ClElementwiseMin() = default;
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for min
- *
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
- *
- * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
- * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2
- */
-class ClElementwiseSquaredDiff : public IClOperator
-{
-public:
- /** Default Constructor */
- ClElementwiseSquaredDiff() = default;
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for squared difference
- *
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
- */
-class ClElementwisePower : public IClOperator
-{
-public:
- /** Default Constructor */
- ClElementwisePower() = default;
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported:F16/F32.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for power
- *
- * @param[in] src1 First source tensor info. Data types supported: F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: F16/F32.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp b/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp
deleted file mode 100644
index 7b830a077f..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClElementwiseUnary.h"
-
-#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT);
- _kernel = std::move(k);
-}
-
-Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT);
-}
-
-void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::EXP);
- _kernel = std::move(k);
-}
-
-Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP);
-}
-
-void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::NEG);
- _kernel = std::move(k);
-}
-
-Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG);
-}
-
-void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::SIN);
- _kernel = std::move(k);
-}
-
-Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN);
-}
-
-void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::ABS);
- _kernel = std::move(k);
-}
-
-Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS);
-}
-
-void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::LOG);
- _kernel = std::move(k);
-}
-
-Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG);
-}
-
-void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::ROUND);
- _kernel = std::move(k);
-}
-
-Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseUnary.h b/src/runtime/gpu/cl/operators/ClElementwiseUnary.h
deleted file mode 100644
index b40e3e9a3b..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseUnary.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
-#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to perform inverse square root on an src tensor. */
-class ClRsqrt : public IClOperator
-{
-public:
- /** Constructor */
- ClRsqrt() = default;
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClRsqrt
- *
- * @param[in] src First source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to perform exponential on an src tensor. */
-class ClExp : public IClOperator
-{
-public:
- /** Constructor */
- ClExp() = default;
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClExp
- *
- * @param[in] src First source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to negate an src tensor. */
-class ClNeg : public IClOperator
-{
-public:
- /** Constructor */
- ClNeg() = default;
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClNeg
- *
- * @param[in] src First source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to calculate sine of an src tensor. */
-class ClSin : public IClOperator
-{
-public:
- /** Constructor */
- ClSin() = default;
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClSin
- *
- * @param[in] src First source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to perform elementwise log on an src tensor. */
-class ClLog : public IClOperator
-{
-public:
- /** Constructor */
- ClLog() = default;
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClLog
- *
- * @param[in] src First source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to get the absolute value of an src tensor. */
-class ClAbs : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClAbs
- *
- * @param[in] src First source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to get the round (to the nearest even) value of an src tensor. */
-class ClRound : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClRound
- *
- * @param[in] src First source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */
diff --git a/src/runtime/gpu/cl/operators/ClFill.cpp b/src/runtime/gpu/cl/operators/ClFill.cpp
deleted file mode 100644
index 4d0afaef24..0000000000
--- a/src/runtime/gpu/cl/operators/ClFill.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClFill.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClFillKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
-{
- auto k = std::make_unique<kernels::ClFillKernel>();
- k->configure(compile_context, tensor, constant_value, dst_window);
- _kernel = std::move(k);
-}
-
-Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
-{
- return kernels::ClFillKernel::validate(tensor, constant_value, dst_window);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClFill.h b/src/runtime/gpu/cl/operators/ClFill.h
deleted file mode 100644
index e632d88546..0000000000
--- a/src/runtime/gpu/cl/operators/ClFill.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FILL_H
-#define ARM_COMPUTE_CL_FILL_H
-
-#include "arm_compute/core/Window.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClFillKernel */
-class ClFill : public IClOperator
-{
-public:
- /** Constructor */
- ClFill() = default;
- /** Initialise the kernel's tensor and filling value
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Source tensor info. Supported data types: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClFillKernel
- *
- * @param[in] tensor Source tensor info. Data types supported: All.
- * @param[in] constant_value The value used to fill the planes of the tensor.
- * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FILL_H */
diff --git a/src/runtime/gpu/cl/operators/ClFlatten.cpp b/src/runtime/gpu/cl/operators/ClFlatten.cpp
deleted file mode 100644
index 060b653dee..0000000000
--- a/src/runtime/gpu/cl/operators/ClFlatten.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClFlatten.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClReshapeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClReshapeKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClFlatten.h b/src/runtime/gpu/cl/operators/ClFlatten.h
deleted file mode 100644
index 20ad06ee57..0000000000
--- a/src/runtime/gpu/cl/operators/ClFlatten.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FLATTEN_H
-#define ARM_COMPUTE_CL_FLATTEN_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to flatten a given input */
-class ClFlatten : public IClOperator
-{
-public:
- /** Constructor */
- ClFlatten() = default;
- /** Configure operator for a given list of arguments
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:--------------|
- * |All |All |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All
- * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where:
- * w = width input tensor, h = height input tensor and d = depth input tensor.
- * Data type supported: same as @p src
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClFlatten::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FLATTEN_H */
diff --git a/src/runtime/gpu/cl/operators/ClFloor.cpp b/src/runtime/gpu/cl/operators/ClFloor.cpp
deleted file mode 100644
index 94e77c0c54..0000000000
--- a/src/runtime/gpu/cl/operators/ClFloor.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClFloor.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClFloorKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClFloorKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClFloorKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClFloor.h b/src/runtime/gpu/cl/operators/ClFloor.h
deleted file mode 100644
index f54eef9140..0000000000
--- a/src/runtime/gpu/cl/operators/ClFloor.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FLOOR_H
-#define ARM_COMPUTE_CL_FLOOR_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClFloorKernel */
-class ClFloor : public IClOperator
-{
-public:
- /** Constructor */
- ClFloor() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref ClFloor
- *
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FLOOR_H */
diff --git a/src/runtime/gpu/cl/operators/ClGemm.cpp b/src/runtime/gpu/cl/operators/ClGemm.cpp
deleted file mode 100644
index a80375447d..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemm.cpp
+++ /dev/null
@@ -1,760 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Log.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
-#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "support/Cast.h"
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::cl_gemm;
-using namespace arm_compute::experimental;
-using namespace arm_compute::utils::cast;
-using namespace arm_compute::opencl::kernels;
-
-namespace
-{
-inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
-{
- switch(kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- case CLGEMMKernelType::RESHAPED_V1:
- case CLGEMMKernelType::RESHAPED:
- {
- return true;
- }
- default:
- {
- return false;
- }
- }
-}
-//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
-inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
-{
- if(!constant_weights)
- {
- return CLGEMMKernelType::NATIVE_V1;
- }
-
- auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
- if(bool(gemm_kernel))
- {
- if(validate_gemm_kernel(gemm_kernel.gemm_type))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
- }
- }
- gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
-}
-// Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)
-{
- // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
- TensorInfo tmp_b_info{};
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
- {
- return false;
- }
- // Validate mm kernel
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- gemm_kernel_info.has_pad_y = false;
- if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
- {
- return false;
- }
- gemm_kernel_info.has_pad_y = true;
- if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
- {
- return false;
- }
- return true;
-}
-
-//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,
- const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output)
-{
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
- if(config)
- {
- if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
-
-// Validate lhs_info and rhs_info for reshaped kernel
-inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)
-{
- // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
-
- // Validate reshape LHS kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
- if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
- {
- return false;
- }
-
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
- {
- return false;
- }
- // Validate mm kernel
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
- {
- return false;
- }
- return true;
-}
-
-//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)
-{
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);
- if(config)
- {
- if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_reshaped(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
-} // namespace
-
-ClGemm::ClGemm()
- : _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()),
- _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),
- _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
- _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
- _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
- _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
- _tmp_a(),
- _tmp_b(),
- _reshape_b_only_on_first_run(false),
- _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1),
- _aux_mem(AuxTensorIdx::Count)
-{
-}
-
-void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Set the target for the kernels
- _mm_kernel->set_target(gpu_target);
-
- GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
-
- // Configure and tune matrix multiply kernel
- _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
-
- // Tune kernel statically
- CLScheduler::get().tune_kernel_static(*_mm_kernel);
-}
-
-void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
-
- // Set the target for the kernels
- _reshape_lhs_kernel->set_target(gpu_target);
- _mm_kernel->set_target(gpu_target);
-
- if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
- {
- mult_transpose1xW_width = 4;
- mult_interleave4x4_height = 2;
- }
-
- GEMMRHSMatrixInfo rhs_info;
- rhs_info.n0 = 16 / b->element_size();
- rhs_info.k0 = 1;
- rhs_info.h0 = mult_transpose1xW_width;
- rhs_info.interleave = false;
- rhs_info.transpose = false;
-
- GEMMLHSMatrixInfo lhs_info;
- lhs_info.m0 = 4;
- lhs_info.k0 = 4;
- lhs_info.v0 = mult_interleave4x4_height;
- lhs_info.interleave = true;
- lhs_info.transpose = true;
-
- GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
-
- // Configure interleave kernel
- _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
-
- // Configure transpose kernel
- _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
-
- // Configure and tune matrix multiply kernel
- _mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
-
- CLScheduler::get().tune_kernel_static(*_mm_kernel);
-
- // Request memory for LHS and RHS reshape matrix
- _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
-}
-
-void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = false;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- // Set the target for the kernels
- _reshape_lhs_kernel->set_target(gpu_target);
- _mm_kernel->set_target(gpu_target);
-
- GEMMLHSMatrixInfo lhs_info{};
- GEMMRHSMatrixInfo rhs_info{};
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,
- c, output, gemm_info.reinterpret_input_as_3d());
-
- _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
- _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
-
- // Configure and tune matrix multiply kernel
- _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
- // Request memory for LHS and RHS reshape matrix
- _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
-}
-
-void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- // Set the target for the kernels
- _mm_kernel->set_target(gpu_target);
-
- GEMMLHSMatrixInfo lhs_info{};
- GEMMRHSMatrixInfo rhs_info{};
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);
-
- // Transpose matrix
- _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
-
- // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)
- // During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have
- // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false
-
- // Configure matrix multiply kernel with no y padding support
- kernel_info.has_pad_y = false;
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
- // Configure matrix multiply kernel with y padding support
- kernel_info.has_pad_y = true;
- _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
- // Request memory for RHS reshape matrix
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
-}
-
-Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
- false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
-
- return Status{};
-}
-
-Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
- {
- mult_transpose1xW_width = 4;
- mult_interleave4x4_height = 2;
- }
-
- GEMMRHSMatrixInfo rhs_info;
- rhs_info.n0 = 16 / b->element_size();
- rhs_info.k0 = 1;
- rhs_info.h0 = mult_transpose1xW_width;
- rhs_info.interleave = false;
- rhs_info.transpose = false;
-
- GEMMLHSMatrixInfo lhs_info;
- lhs_info.m0 = 4;
- lhs_info.k0 = 4;
- lhs_info.v0 = mult_interleave4x4_height;
- lhs_info.interleave = true;
- lhs_info.transpose = true;
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
- true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
-
- return Status{};
-}
-
-Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = false;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- GEMMLHSMatrixInfo lhs_info;
- GEMMRHSMatrixInfo rhs_info;
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
-
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
-
- return Status{};
-}
-
-Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- TensorInfo tmp_b_info{};
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- const DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- GEMMLHSMatrixInfo lhs_info;
- GEMMRHSMatrixInfo rhs_info;
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
-
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- kernel_info.has_pad_y = false;
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
-
- kernel_info.has_pad_y = true;
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
-
- return Status{};
-}
-
-void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));
-
- // Check if we need to reshape the matrix B only on the first run
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-
- // Select GEMMType
- _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,
- gemm_info.constant_weights());
-
- const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
-
- ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
-
- switch(_gemm_kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- {
- configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- case CLGEMMKernelType::RESHAPED_V1:
- {
- configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- case CLGEMMKernelType::RESHAPED:
- {
- configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("GEMMType not supported");
- }
- }
-}
-
-Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- // Get the GPU target
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-
- // Select GEMMType
- CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
- {
- CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
- },
- gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights());
-
- const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
-
- const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
-
- switch(gemm_kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- case CLGEMMKernelType::RESHAPED_V1:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- case CLGEMMKernelType::RESHAPED:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- default:
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
- }
- }
-
- return Status{};
-}
-
-void ClGemm::run(ITensorPack &tensors)
-{
- const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0);
- const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1);
- const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2);
- ITensor *dst = tensors.get_tensor(ACL_DST);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);
-
- CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
- CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
-
- // Prepare the consts if needed
- prepare(tensors);
-
- // Run matrix multiply kernel
- switch(_gemm_kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- {
- CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true);
- break;
- }
- case CLGEMMKernelType::RESHAPED_V1:
- case CLGEMMKernelType::RESHAPED:
- {
- // Run interleave kernel
- ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
- }
-
- ITensorPack gemm_reshaped_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
- {
- CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
- }
- else
- {
- CLScheduler::get().enqueue_op(*_mm_kernel, gemm_reshaped_pack, true);
- }
- break;
- }
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
- }
- // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
- // Check if the lhs or dst tensors have padding
- const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
- const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
- bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
-
- ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
- if(has_pad_y)
- {
- CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true);
- }
- else
- {
- CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true);
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("GEMMType not supported");
- }
- }
-}
-
-void ClGemm::prepare(ITensorPack &constants)
-{
- const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
- ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
-
- // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
- if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
- {
- CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
- ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
-
- ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
- }
-}
-
-experimental::MemoryRequirements ClGemm::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClGemm.h b/src/runtime/gpu/cl/operators/ClGemm.h
deleted file mode 100644
index bd9ca17edf..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemm.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_H
-#define ARM_COMPUTE_CL_GEMM_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTypes.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
- *
- * -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if the RESHAPED_V1 is selected by the heuristic model)
- * -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if either the RESHAPED_V1 or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- * -# @ref kernels::ClGemmMatrixMultiplyKernel (only if either the NATIVE or RESHAPED_V1 is selected by the select_gemm_kernel method())
- * -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED_V1 is selected by the select_gemm_kernel method())
- * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- */
-class ClGemm : public IClOperator
-{
-public:
- /** Constructor */
- ClGemm();
- /** Initialise the kernel's inputs and output
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------------|:-----------|:---------|:--------------|
- * |F32 |F32 |F32 |F32 |
- * |F16 |F16 |F16 |F16 |
- *
- * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
- *
- * @note All tensors must have the same data type.
- *
- * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[out] output Output tensor. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
- * in case matrix A and matrix B have been already transformed.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClGemm::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- void configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
- static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
-private:
- enum AuxTensorIdx
- {
- LhsReshape = 0,
- RhsReshape,
- Count
- };
-
-private:
- std::unique_ptr<kernels::ClGemmMatrixMultiplyKernel> _mm_kernel;
- std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel> _reshape_lhs_kernel;
- std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _reshape_rhs_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel;
- TensorInfo _tmp_a;
- TensorInfo _tmp_b;
- bool _reshape_b_only_on_first_run;
- CLGEMMKernelType _gemm_kernel_type;
-
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMM_H */
diff --git a/src/runtime/gpu/cl/operators/ClLogicalNot.cpp b/src/runtime/gpu/cl/operators/ClLogicalNot.cpp
deleted file mode 100644
index 400efe450d..0000000000
--- a/src/runtime/gpu/cl/operators/ClLogicalNot.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClLogicalNot.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT);
- _kernel = std::move(k);
-}
-
-Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClLogicalNot.h b/src/runtime/gpu/cl/operators/ClLogicalNot.h
deleted file mode 100644
index 25ddf564b5..0000000000
--- a/src/runtime/gpu/cl/operators/ClLogicalNot.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H
-#define ARM_COMPUTE_CL_LOGICAL_NOT_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */
-class ClLogicalNot : public IClOperator
-{
-public:
- /** Constructor */
- ClLogicalNot() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: U8.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * @param[in] src Soure tensor info. Data types supported: U8.
- * @param[in] dst Destination tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */
diff --git a/src/runtime/gpu/cl/operators/ClMul.cpp b/src/runtime/gpu/cl/operators/ClMul.cpp
deleted file mode 100644
index d1e2bc806f..0000000000
--- a/src/runtime/gpu/cl/operators/ClMul.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClMul.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClMulKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClMulKernel>();
- k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
- _kernel = std::move(k);
-}
-
-Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
-}
-
-void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClComplexMulKernel>();
- k->configure(compile_context, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClMul.h b/src/runtime/gpu/cl/operators/ClMul.h
deleted file mode 100644
index 4a662b3276..0000000000
--- a/src/runtime/gpu/cl/operators/ClMul.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_MUL_H
-#define ARM_COMPUTE_CL_MUL_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref opencl::kernels::ClMulKernel */
-class ClMul : public IClOperator
-{
-public:
- /** Default Constructor */
- ClMul() = default;
- /** Initialise the kernel's sources, dst and convertion policy.
- *
- * Valid configurations (src1,src2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,U8) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */
-class ClComplexMul : public IClOperator
-{
-public:
- /** Default Constructor */
- ClComplexMul() = default;
- /** Initialise the kernel's sources, dst.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClComplexMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_MUL_H */ \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPRelu.cpp b/src/runtime/gpu/cl/operators/ClPRelu.cpp
deleted file mode 100644
index d1ce14cc87..0000000000
--- a/src/runtime/gpu/cl/operators/ClPRelu.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPRelu.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using KernelType = kernels::ClArithmeticKernel;
-void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
-{
- auto k = std::make_unique<KernelType>();
- k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
- _kernel = std::move(k);
-}
-
-Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
- return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
-}
-
-void ClPRelu::run(ITensorPack &tensors)
-{
- // Output tensor can be given as nullptr for in-place computation.
- // In this case, get the input tensor and use it as the output tensor.
- if(tensors.get_tensor(TensorType::ACL_DST) == nullptr)
- {
- auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation");
- tensors.add_tensor(TensorType::ACL_DST, src_tensor);
- }
- IClOperator::run(tensors);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPRelu.h b/src/runtime/gpu/cl/operators/ClPRelu.h
deleted file mode 100644
index 70202aeb81..0000000000
--- a/src/runtime/gpu/cl/operators/ClPRelu.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_PRELU_H
-#define ARM_COMPUTE_CL_PRELU_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
- *
- * @note The operator implements an activation layer with the PRELU activation function.
- */
-class ClPRelu : public IClOperator
-{
-public:
- /** Default constructor */
- ClPRelu() = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_PRELU_H */
diff --git a/src/runtime/gpu/cl/operators/ClPermute.cpp b/src/runtime/gpu/cl/operators/ClPermute.cpp
deleted file mode 100644
index 719bb6dac6..0000000000
--- a/src/runtime/gpu/cl/operators/ClPermute.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClPermuteKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
-{
- auto k = std::make_unique<kernels::ClPermuteKernel>();
- k->configure(compile_context, src, dst, perm);
- _kernel = std::move(k);
-}
-
-Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- return kernels::ClPermuteKernel::validate(src, dst, perm);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPermute.h b/src/runtime/gpu/cl/operators/ClPermute.h
deleted file mode 100644
index 20e7a32428..0000000000
--- a/src/runtime/gpu/cl/operators/ClPermute.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_PERMUTE_H
-#define ARM_COMPUTE_CL_PERMUTE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClPermuteKernel */
-class ClPermute : public IClOperator
-{
-public:
- /** Constructor */
- ClPermute() = default;
- /** Initialise the kernel's inputs and outputs and permute vector
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClPermuteKernel.
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src First tensor src info. Data types supported: All.
- * @param[in] dst Output tensor info. Data types supported: same as @p src.
- * @param[in] perm Permutation vector
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_PERMUTE_H */ \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.cpp b/src/runtime/gpu/cl/operators/ClPool2d.cpp
deleted file mode 100644
index 40c2b0a8ba..0000000000
--- a/src/runtime/gpu/cl/operators/ClPool2d.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPool2d.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- // Configure pooling kernel
- auto k = std::make_unique<kernels::ClPool2dKernel>();
- k->set_target(CLScheduler::get().target());
- k->configure(compile_context, src, dst, info, indices);
- _pooling = std::move(k);
-
- const DataType data_type = src->data_type();
-
- // Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode{};
- PixelValue pixel_value(0.f);
- if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
- {
- pixel_value = PixelValue(0, data_type, src->quantization_info());
- }
-
- // Data layout
- const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
- switch(data_layout)
- {
- case DataLayout::NCHW:
- border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- break;
- case DataLayout::NHWC:
- border_mode = BorderMode::CONSTANT;
- if(PoolingType::MAX == info.pool_type)
- {
- if(is_data_type_quantized(data_type))
- {
- std::tie(pixel_value, std::ignore) = get_min_max(data_type);
- }
- else
- {
- pixel_value = PixelValue(std::numeric_limits<float>::lowest());
- }
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
- _border_handler = std::move(b);
-
- // Tune kernels
- CLScheduler::get().tune_kernel_static(*_pooling);
-}
-
-Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
-{
- return kernels::ClPool2dKernel::validate(src, dst, info, indices);
-}
-
-void ClPool2d::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
- CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
- CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.h b/src/runtime/gpu/cl/operators/ClPool2d.h
deleted file mode 100644
index 8ac386a64b..0000000000
--- a/src/runtime/gpu/cl/operators/ClPool2d.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_POOL2D_H
-#define ARM_COMPUTE_CL_POOL2D_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref opencl::ClPool2d
- */
-class ClPool2d : public IClOperator
-{
-public:
- /** Constructor */
- ClPool2d() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] info Pooling layer parameters.
- * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClPool2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-
-private:
- std::unique_ptr<ICLKernel> _pooling{ nullptr };
- std::unique_ptr<ICLKernel> _border_handler{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_POOL2D_H */
diff --git a/src/runtime/gpu/cl/operators/ClQuantize.cpp b/src/runtime/gpu/cl/operators/ClQuantize.cpp
deleted file mode 100644
index 92bbb62ba5..0000000000
--- a/src/runtime/gpu/cl/operators/ClQuantize.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClQuantize.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClQuantizeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClQuantizeKernel::validate(src, dst);
-}
-
-void ClQuantize::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClQuantize.h b/src/runtime/gpu/cl/operators/ClQuantize.h
deleted file mode 100644
index 0b6d2c8cbe..0000000000
--- a/src/runtime/gpu/cl/operators/ClQuantize.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_QUANTIZE_H
-#define ARM_COMPUTE_CL_QUANTIZE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClQuantizeKernel that dequantizes an input tensor */
-class ClQuantize : public IClOperator
-{
-public:
- /** Constructor */
- ClQuantize() = default;
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/32.
- * @param[out] dst Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this function
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClQuantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_QUANTIZE_H */
diff --git a/src/runtime/gpu/cl/operators/ClReshape.cpp b/src/runtime/gpu/cl/operators/ClReshape.cpp
deleted file mode 100644
index d3fa9f10ab..0000000000
--- a/src/runtime/gpu/cl/operators/ClReshape.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClReshape.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClReshapeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClReshapeKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClReshape.h b/src/runtime/gpu/cl/operators/ClReshape.h
deleted file mode 100644
index 8cccc5776c..0000000000
--- a/src/runtime/gpu/cl/operators/ClReshape.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_RESHAPE_H
-#define ARM_COMPUTE_CL_RESHAPE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClReshapeKernel */
-class ClReshape : public IClOperator
-{
-public:
- /** Constructor */
- ClReshape() = default;
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor info. Data type supported: All
- * @param[out] output Output info. Data type supported: Same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClReshapeKernel
- *
- * @param[in] input Input tensor info. Data type supported: All
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_RESHAPE_H */ \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClScale.cpp b/src/runtime/gpu/cl/operators/ClScale.cpp
deleted file mode 100644
index 4730c8a16e..0000000000
--- a/src/runtime/gpu/cl/operators/ClScale.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClScale.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClScaleKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- // Configure Scale kernel
- auto k = std::make_unique<kernels::ClScaleKernel>();
- k->set_target(CLScheduler::get().target());
- k->configure(compile_context, src, dst, info);
- _kernel = std::move(k);
- if(!_kernel->border_size().empty())
- {
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, src, _kernel->border_size(), info.border_mode, info.constant_border_value);
- _border_handler = std::move(b);
- }
- // Tune kernel
- CLScheduler::get().tune_kernel_static(*_kernel);
-}
-
-Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- return kernels::ClScaleKernel::validate(src, dst, info);
-}
-
-void ClScale::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- if(!_kernel->border_size().empty())
- {
- CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
- }
- CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClScale.h b/src/runtime/gpu/cl/operators/ClScale.h
deleted file mode 100644
index 6eccb59be8..0000000000
--- a/src/runtime/gpu/cl/operators/ClScale.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_SCALE_H
-#define ARM_COMPUTE_CL_SCALE_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::ClScaleKernel
- */
-class ClScale : public IClOperator
-{
-public:
- /** Constructor */
- ClScale() = default;
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] dst Destination tensor info. Data types supported: Same as @p src
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref ClScale
- *
- * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32.
- * @param[in] dst Output tensor info. Data type supported: Same as @p src
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo descriptor to be used to validate
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-
-protected:
- std::unique_ptr<ICLKernel> _border_handler{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCALE_H */
diff --git a/src/runtime/gpu/cl/operators/ClSoftmax.cpp b/src/runtime/gpu/cl/operators/ClSoftmax.cpp
deleted file mode 100644
index 975bb0b932..0000000000
--- a/src/runtime/gpu/cl/operators/ClSoftmax.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClSoftmax.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-#include "support/Cast.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace opencl
-{
-ClSoftmax::ClSoftmax()
- : _permute_input(std::make_unique<ClPermute>()),
- _permute_output(std::make_unique<ClPermute>()),
- _max_shift_exp_sum_kernel(std::make_unique<kernels::ClLogits1DMaxShiftExpSumKernel>()),
- _norm_kernel(std::make_unique<kernels::ClLogits1DNormKernel>()),
- _max_info(),
- _sum_info(),
- _tmp_info(),
- _permuted_src_info(),
- _permuted_dst_info(),
- _aux_mem(InternalTensorIdx::COUNT)
-{
-}
-
-void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info));
-
- const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
-
- _needs_permute = actual_axis != 0;
-
- const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src;
- ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst;
-
- if(_needs_permute)
- {
- const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info);
- }
-
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
- _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type);
-
- TensorShape max_sum_shape = tmp_input_info.tensor_shape();
- _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape);
- _sum_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type);
-
- // Set GPU target to kernels
- _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target());
-
- _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info);
- _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info);
-
- if(_needs_permute)
- {
- const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info);
- }
-
- _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
-
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size());
-}
-
-Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported");
- ARM_COMPUTE_UNUSED(info.beta);
- ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis);
-
- const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
- const bool needs_permute = actual_axis != 0;
- if(needs_permute)
- {
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
- TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector));
- TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector));
- }
-
- // Create intermediate tensor info
- DataType tmp_data_type = is_data_type_quantized_asymmetric(src.data_type()) ? DataType::S32 : src.data_type();
- TensorInfo tensor_info_tmp(src.clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
- TensorShape max_sum_shape = src.tensor_shape();
- max_sum_shape.set(0, 1);
- TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
- TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info));
-
- return Status{};
-}
-
-void ClSoftmax::run(ITensorPack &tensors)
-{
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- CLAuxTensorHandler sum(offset_int_vec(InternalTensorIdx::SUM), _sum_info, tensors, false);
- CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false);
- CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false);
-
- CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false);
- CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false);
-
- if(_needs_permute)
- {
- ITensorPack pack;
- pack.add_const_tensor(TensorType::ACL_SRC, src);
- pack.add_tensor(TensorType::ACL_DST, permuted_src.get());
- _permute_input.get()->run(pack);
- }
-
- ITensorPack sum_pack;
- ITensorPack norm_pack;
- if(_needs_permute)
- {
- sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get());
- norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get());
- }
- else
- {
- sum_pack.add_const_tensor(TensorType::ACL_SRC, src);
- norm_pack.add_tensor(TensorType::ACL_DST, dst);
- }
- sum_pack.add_tensor(TensorType::ACL_DST, tmp.get());
- sum_pack.add_tensor(TensorType::ACL_INT_0, max.get());
- sum_pack.add_tensor(TensorType::ACL_INT_1, sum.get());
-
- norm_pack.add_const_tensor(TensorType::ACL_SRC, tmp.get());
- norm_pack.add_tensor(TensorType::ACL_INT_0, sum.get());
-
- CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false);
- CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false);
-
- if(_needs_permute)
- {
- ITensorPack pack;
- pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get());
- pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output.get()->run(pack);
- }
-}
-
-experimental::MemoryRequirements ClSoftmax::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClSoftmax.h b/src/runtime/gpu/cl/operators/ClSoftmax.h
deleted file mode 100644
index f19a51fc5e..0000000000
--- a/src/runtime/gpu/cl/operators/ClSoftmax.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_SOFTMAX_H
-#define ARM_COMPUTE_CL_SOFTMAX_H
-
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-struct SoftmaxKernelInfo;
-
-namespace opencl
-{
-class ClPermute;
-namespace kernels
-{
-class ClLogits1DMaxShiftExpSumKernel;
-class ClLogits1DNormKernel;
-} // namespace kernels
-class ClSoftmax : public IClOperator
-{
-public:
- /** Constructor */
- ClSoftmax();
- /** Configure the operator
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
- * @param[out] dst Destination tensor info. Data types supported: same as @p src
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- *
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info);
- /** Static function to check if the given info will lead to a valid configuration
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
- * @param[out] dst Destination tensor info. Data types supported: same as @p src
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- *
- */
- static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum InternalTensorIdx
- {
- MAX = 0,
- SUM,
- TMP,
- PERMUTED_SRC,
- PERMUTED_DST,
- COUNT
- };
-
- std::unique_ptr<ClPermute> _permute_input;
- std::unique_ptr<ClPermute> _permute_output;
- std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel;
- std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel;
- bool _needs_permute{ false };
-
- TensorInfo _max_info;
- TensorInfo _sum_info;
- TensorInfo _tmp_info;
- TensorInfo _permuted_src_info;
- TensorInfo _permuted_dst_info;
-
- experimental::MemoryRequirements _aux_mem{};
-};
-
-} // opencl
-} // arm_compute
-#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClSub.cpp b/src/runtime/gpu/cl/operators/ClSub.cpp
deleted file mode 100644
index 429f23a837..0000000000
--- a/src/runtime/gpu/cl/operators/ClSub.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClSub.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
- _kernel = std::move(k);
-}
-
-Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClSub.h b/src/runtime/gpu/cl/operators/ClSub.h
deleted file mode 100644
index bcad84d583..0000000000
--- a/src/runtime/gpu/cl/operators/ClSub.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_SUB_H
-#define ARM_COMPUTE_CL_SUB_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run arithmetic subtraction
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @note The function performs an arithmetic subtraction between two tensors.
- */
-class ClSub : public IClOperator
-{
-public:
- /** Default Constructor */
- ClSub() = default;
- /** Configure function for a given list of arguments.
- *
- * Valid configurations (src1,src2) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref ClSub
- *
- * Valid configurations (src1,src2) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_SUB_H */
diff --git a/src/runtime/gpu/cl/operators/ClTranspose.cpp b/src/runtime/gpu/cl/operators/ClTranspose.cpp
deleted file mode 100644
index 48f44282e8..0000000000
--- a/src/runtime/gpu/cl/operators/ClTranspose.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClTranspose.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClTransposeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClTransposeKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClTranspose.h b/src/runtime/gpu/cl/operators/ClTranspose.h
deleted file mode 100644
index d898f677ca..0000000000
--- a/src/runtime/gpu/cl/operators/ClTranspose.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_TRANSPOSE_H
-#define ARM_COMPUTE_CL_TRANSPOSE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClTransposeKernel */
-class ClTranspose : public IClOperator
-{
-public:
- /** Constructor */
- ClTranspose() = default;
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: Same as @p src
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref kernels::ClTransposeKernel.
- *
- * @param[in] src First tensor src info. Data types supported: All.
- * @param[in] dst Output tensor info. Data types supported: same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */
diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
deleted file mode 100644
index c8db697778..0000000000
--- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
-#include "src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
-#include "src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-#include "support/Cast.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace
-{
-Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
-{
- Size2D output_tile = Size2D{};
-
- const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
-
- // Check if the input spatial dimensions are smaller than 4
- const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
-
- if(kernel_max_dim == 3U)
- {
- if(kernel_dims == Size2D(3U, 3U))
- {
- output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
- }
- else if(kernel_dims == Size2D(3U, 1U))
- {
- output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
- }
- else
- {
- output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
- }
- }
- else if(kernel_max_dim == 5U)
- {
- output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
- kernel_dims.height == 1 ? 1U : 4U);
- }
- else if(kernel_max_dim == 7U)
- {
- output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
- kernel_dims.height == 1 ? 1U : 2U);
- }
-
- return output_tile;
-}
-
-bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
-{
- // Check if we want to configure a Winograd configuration which requires fast math
- using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- std::vector<WinogradConfiguration> fast_math_winograd =
- {
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
- WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
- };
-
- auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
- std::pair<int, int>(kernel_size.width, kernel_size.height));
-
- return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- // Get indeces for the width and height
- const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
-
- // Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
- }
-
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- src->data_layout());
-
- // Validate input transform
- const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
- const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
-
- // Validate filter transform
- const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
- const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
-
- // Validate batched matrix multiply
- TensorShape batched_mm_output_shape = input0.tensor_shape();
- batched_mm_output_shape[0] = input1.tensor_shape()[0];
- const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
- GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
-
- // Configure output transform
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
- return Status{};
-}
-
-} // namespace
-
-ClWinogradConv2d::ClWinogradConv2d()
- : _batched_mm(),
- _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()),
- _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()),
- _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()),
- _border_handler(),
- _input0(),
- _input1(),
- _batched_mm_output(),
- _is_prepared(false),
- _aux_mem()
-{
-}
-
-ClWinogradConv2d::~ClWinogradConv2d() = default;
-
-void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
- // Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
-
- // Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
- ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
- }
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- src->data_layout());
-
- _is_prepared = false;
-
- // Configure input transform
- _input_transform->configure(compile_context, src, &_input0, winograd_info);
- _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue());
-
- // Configure filter transform
- _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
-
- // Configure batched matrix multiply
- _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0,
- false, false,
- GEMMLowpOutputStageInfo(),
- (src->data_type() == DataType::F16)));
-
- // Configure output transform
- _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
-
- _aux_mem = _batched_mm.workspace();
- _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
- _aux_mem.push_back(MemoryInfo(offset_int_vec(3), MemoryLifetime::Persistent, _input1.total_size()));
- _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
-}
-
-Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
- return Status{};
-}
-
-void ClWinogradConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- // Run input transform
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true);
- CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true);
- CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
-
- ITensorPack pack_it
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, input0.get() },
- };
- CLScheduler::get().enqueue_op(_border_handler, pack_it);
- CLScheduler::get().enqueue_op(*_input_transform, pack_it);
-
- // Run batched matrix multiplication
- ITensorPack pack_mm
- {
- { TensorType::ACL_SRC_0, input0.get() },
- { TensorType::ACL_SRC_1, input1.get() },
- { TensorType::ACL_DST, batched_mm_output.get() },
- };
- _batched_mm.run(pack_mm);
-
- // Run output transform
- ITensorPack pack_ot
- {
- { TensorType::ACL_SRC_0, batched_mm_output.get() },
- { TensorType::ACL_SRC_1, biases },
- { TensorType::ACL_DST, dst },
- };
- CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
-}
-
-void ClWinogradConv2d::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
-
- CLAuxTensorHandler input1(_input1, *in1_aux);
- ITensorPack pack_ft
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, input1.get() },
- };
- // Run filter transform and mark original weights as unused
- CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
- weights->mark_as_unused();
-
- tensors.add_tensor(ACL_SRC_1, input1.get());
- // Prepare GEMM and release reshaped weights if marked unused by ClGemm
- _batched_mm.prepare(tensors);
-
- CLScheduler::get().queue().finish();
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements ClWinogradConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.h b/src/runtime/gpu/cl/operators/ClWinogradConv2d.h
deleted file mode 100644
index 83b31f1c99..0000000000
--- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H
-#define ARM_COMPUTE_CL_WINOGRADCONV2D_H
-
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ITensorInfo;
-namespace opencl
-{
-namespace kernels
-{
-class ClWinogradInputTransformKernel;
-class ClWinogradFilterTransformKernel;
-class ClWinogradOutputTransformKernel;
-} // kernels
-/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
- *
- * -# @ref kernels::ClWinogradInputTransformKernel
- * -# @ref kernels::ClWinogradFilterTransformKernel (only once)
- * -# @ref ClGemm
- * -# @ref kernels::ClWinogradOutputTransformKernel
- *
- */
-class ClWinogradConv2d : public IClOperator
-{
-public:
- /** Default constructor */
- ClWinogradConv2d();
- /** Default destructor */
- ~ClWinogradConv2d();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClWinogradConv2d(const ClWinogradConv2d &) = delete;
- /** Default move constructor */
- ClWinogradConv2d(ClWinogradConv2d &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete;
- /** Default move assignment operator */
- ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default;
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:--------------|:------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- *
- * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
- * @note Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p src
- * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
- * available which may introduce a drop of accuracy as well. Default is false
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClWinogradConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- ClGemm _batched_mm;
- std::unique_ptr<kernels::ClWinogradInputTransformKernel> _input_transform;
- std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform;
- std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform;
- CLFillBorderKernel _border_handler;
- TensorInfo _input0;
- TensorInfo _input1;
- TensorInfo _batched_mm_output;
- bool _is_prepared;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */
diff --git a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h b/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
deleted file mode 100644
index 152e3c6c04..0000000000
--- a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
-#define ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
-
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/* Tensor handler to wrap and handle tensor allocations on workspace buffers */
-class CLAuxTensorHandler
-{
-public:
- CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
- : _tensor()
- {
- _tensor.allocator()->soft_init(info);
-
- ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
- if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
- {
- _tensor.allocator()->allocate();
- if(pack_inject)
- {
- pack.add_tensor(slot_id, &_tensor);
- _injected_tensor_pack = &pack;
- _injected_slot_id = slot_id;
- }
- }
- else
- {
- _tensor.allocator()->import_memory(packed_tensor->cl_buffer());
- }
- }
-
- CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor)
- : _tensor()
- {
- _tensor.allocator()->soft_init(info);
- if(info.total_size() <= tensor.info()->total_size())
- {
- _tensor.allocator()->import_memory(tensor.cl_buffer());
- }
- }
-
- CLAuxTensorHandler(const CLAuxTensorHandler &) = delete;
- CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete;
-
- ~CLAuxTensorHandler()
- {
- if(_injected_tensor_pack)
- {
- _injected_tensor_pack->remove_tensor(_injected_slot_id);
- }
- }
-
- ICLTensor *get()
- {
- return &_tensor;
- }
-
- ICLTensor *operator()()
- {
- return &_tensor;
- }
-
-private:
- CLTensor _tensor{};
- ITensorPack *_injected_tensor_pack{ nullptr };
- int _injected_slot_id{ TensorType::ACL_UNKNOWN };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */ \ No newline at end of file
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
new file mode 100644
index 0000000000..aba32871d0
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
+{
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
+ &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+ &ClDirectConvDefaultConfigBifrost::configure_default_f32,
+ &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch (_target)
+ {
+ case GPUTarget::G71:
+ func = configs_G71.get_function(src->data_type());
+ break;
+ default:
+ func = configs_default.get_function(src->data_type());
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution");
+ return (this->*func)(src, wei, conv_info);
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 2;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = false;
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 4;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = false;
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 4;
+ }
+
+ desc.k0 = 16;
+
+ desc.export_weights_to_cl_image = false;
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 2;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = export_to_cl_image(wei);
+ }
+
+ return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
+{
+ DirectConvComputeKernelInfo desc;
+
+ if (src->data_layout() == DataLayout::NHWC)
+ {
+ // Get the output shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+ desc.n0 = 4;
+
+ if (output_shape[0] > 16)
+ {
+ desc.m0 = 4;
+ }
+
+ desc.k0 = 8;
+
+ desc.export_weights_to_cl_image = export_to_cl_image(wei);
+ }
+
+ return desc;
+}
+} // namespace cl_direct_conv
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
new file mode 100644
index 0000000000..ed6a4c3c68
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST
+#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST
+
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Default OpenCL direct convolution kernel configuration for Bifrost based GPU targets */
+class ClDirectConvDefaultConfigBifrost final : public IClDirectConvKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClDirectConvDefaultConfigBifrost(GPUTarget gpu);
+
+    // Inherited method overridden:
+    DirectConvComputeKernelInfo configure(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+
+private:
+    // Private heuristics; the names carry the GPU target ("G71" / "default") and the
+    // data type each one is written for. All of them take the same arguments as configure().
+    DirectConvComputeKernelInfo configure_G71_f32(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_G71_f16(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_G71_u8(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_default_f32(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_default_f16(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..4b7666d5aa
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu)
+    : IClDirectConvKernelConfig(gpu) // the base class keeps the GPU target
+{
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    // Pointer to one of the private per-target / per-data-type heuristics of this class
+    using HeuristicFn = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+
+    ClDirectConvConfigArray<HeuristicFn> g78_table(&ClDirectConvDefaultConfigValhall::configure_G78_f32,
+                                                   &ClDirectConvDefaultConfigValhall::configure_G78_f16,
+                                                   &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+
+    // G57 has no 8-bit heuristic of its own: the G78 one fills that slot
+    ClDirectConvConfigArray<HeuristicFn> g57_table(&ClDirectConvDefaultConfigValhall::configure_G57_f32,
+                                                   &ClDirectConvDefaultConfigValhall::configure_G57_f16,
+                                                   &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+
+    // Only G57 gets a dedicated table; G78 and every other target use the G78 one
+    ClDirectConvConfigArray<HeuristicFn> &table = (_target == GPUTarget::G57) ? g57_table : g78_table;
+    const HeuristicFn                     func  = table.get_function(src->data_type());
+
+    // NOTE(review): if ARM_COMPUTE_ERROR_ON_MSG is compiled out in some build configurations,
+    // a null func would be invoked below - confirm the data type is validated by the caller.
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution");
+    return (this->*func)(src, wei, conv_info);
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DirectConvComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const TensorShape wei_shape = wei->tensor_shape();
+    const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+    desc.export_weights_to_cl_image = export_to_cl_image(wei);
+
+    const int32_t ofm = dst_shape[0];
+    const int32_t m   = dst_shape[1] * dst_shape[2];
+    // Pointwise: dimensions 1 and 2 of the weights are both 1
+    const bool is_pointwise = (wei_shape[1] == 1) && (wei_shape[2] == 1);
+
+    if (dst_shape[0] <= 4)
+    {
+        // At most 4 elements along dimension 0 of the output
+        desc.m0 = 1;
+        desc.k0 = 16;
+        if (!is_pointwise)
+        {
+            desc.n0 = 2;
+        }
+        else
+        {
+            desc.n0 = (ofm == 4) ? 4 : 1;
+        }
+    }
+    else if (m < 64)
+    {
+        // Small product of output dimensions 1 and 2
+        desc.m0 = 1;
+        desc.n0 = 1;
+        desc.k0 = 16;
+    }
+    else
+    {
+        desc.m0 = 4;
+        desc.n0 = 4;
+        desc.k0 = 4;
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DirectConvComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const TensorShape wei_shape = wei->tensor_shape();
+    const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+    desc.export_weights_to_cl_image = export_to_cl_image(wei);
+
+    const int32_t ofm = dst_shape[0];
+    const int32_t m   = dst_shape[1] * dst_shape[2];
+    const int32_t k   = wei_shape[0];
+    // Pointwise: dimensions 1 and 2 of the weights are both 1
+    const bool is_pointwise = (wei_shape[1] == 1) && (wei_shape[2] == 1);
+
+    // k0 should be as large as possible. However, we should avoid
+    // having left-over for loops that make the implementation slower,
+    // so the largest of 16 / 8 that divides k is picked, falling back to 4.
+    const auto k0_without_leftover = [k]() -> int32_t
+    {
+        if ((k % 16) == 0)
+        {
+            return 16;
+        }
+        if ((k % 8) == 0)
+        {
+            return 8;
+        }
+        return 4;
+    };
+
+    if (dst_shape[0] <= 4)
+    {
+        // At most 4 elements along dimension 0 of the output
+        desc.k0 = k0_without_leftover();
+        desc.m0 = 1;
+        if (is_pointwise)
+        {
+            desc.n0 = (ofm == 4) ? 4 : 1;
+        }
+        else
+        {
+            desc.n0 = dst_shape[0];
+        }
+    }
+    else if (m < 64)
+    {
+        // Small product of output dimensions 1 and 2
+        desc.m0 = 1;
+        desc.n0 = 1;
+        desc.k0 = k0_without_leftover();
+    }
+    else if (ofm >= 16)
+    {
+        desc.m0 = (m / 6 > 24000) ? 6 : 5;
+        desc.n0 = 8;
+        desc.k0 = 4;
+    }
+    else
+    {
+        desc.m0 = 2;
+        desc.n0 = 8;
+        desc.k0 = k0_without_leftover();
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DirectConvComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    // Output shape derived from the source, the weights and the convolution info
+    const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+    desc.n0 = 4;
+    desc.k0 = 16;
+
+    // m0 is overridden only when dimension 0 of the output shape is larger than 16
+    if (dst_shape[0] > 16)
+    {
+        desc.m0 = 4;
+    }
+
+    // Weights are never exported to cl_image by this configuration
+    desc.export_weights_to_cl_image = false;
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DirectConvComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const TensorShape wei_shape = wei->tensor_shape();
+    const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+    desc.export_weights_to_cl_image = export_to_cl_image(wei);
+
+    const int32_t m = dst_shape[1] * dst_shape[2];
+    // Pointwise: dimensions 1 and 2 of the weights are both 1
+    const bool is_pointwise = (wei_shape[1] == 1) && (wei_shape[2] == 1);
+
+    if (dst_shape[0] <= 4)
+    {
+        // At most 4 elements along dimension 0 of the output
+        desc.m0 = 1;
+        desc.k0 = 16;
+        if (is_pointwise)
+        {
+            desc.n0 = 1;
+        }
+        else
+        {
+            desc.n0 = dst_shape[0];
+        }
+    }
+    else if (m >= 64)
+    {
+        desc.m0 = 4;
+        desc.n0 = 4;
+        desc.k0 = 4;
+    }
+    else if (m == 1)
+    {
+        desc.m0 = 1;
+        desc.n0 = 1;
+        desc.k0 = 16;
+    }
+    else
+    {
+        // m below 64 and different from 1
+        desc.m0 = 4;
+        desc.n0 = 2;
+        desc.k0 = 8;
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DirectConvComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const TensorShape wei_shape = wei->tensor_shape();
+    const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+    desc.export_weights_to_cl_image = export_to_cl_image(wei);
+
+    const int32_t ofm = dst_shape[0];
+    const int32_t m   = dst_shape[1] * dst_shape[2];
+    // Pointwise: dimensions 1 and 2 of the weights are both 1
+    const bool is_pointwise = (wei_shape[1] == 1) && (wei_shape[2] == 1);
+
+    if (dst_shape[0] <= 4)
+    {
+        // At most 4 elements along dimension 0 of the output
+        desc.k0 = 16;
+        if (is_pointwise)
+        {
+            desc.m0 = 2;
+            desc.n0 = 1;
+        }
+        else
+        {
+            desc.m0 = 1;
+            desc.n0 = dst_shape[0];
+        }
+    }
+    else if (m < 64)
+    {
+        if (m == 1)
+        {
+            desc.m0 = 1;
+            desc.n0 = 1;
+            desc.k0 = 16;
+        }
+        else
+        {
+            desc.m0 = 4;
+            desc.n0 = 2;
+            desc.k0 = 8;
+        }
+    }
+    else if (ofm > 16)
+    {
+        desc.m0 = 4;
+        desc.n0 = 8;
+        desc.k0 = 8;
+    }
+    else
+    {
+        desc.m0 = 8;
+        desc.n0 = 4;
+        desc.k0 = 4;
+    }
+
+    return desc;
+}
+} // namespace cl_direct_conv
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
new file mode 100644
index 0000000000..efd879a567
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL
+#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL
+
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Default OpenCL direct convolution kernel configuration for Valhall based GPU targets */
+class ClDirectConvDefaultConfigValhall final : public IClDirectConvKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClDirectConvDefaultConfigValhall(GPUTarget gpu);
+
+    // Inherited method overridden:
+    DirectConvComputeKernelInfo configure(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+
+private:
+    // Private heuristics; the names carry the GPU target (G78 / G57) and the data type
+    // each one is written for. All of them take the same arguments as configure().
+    DirectConvComputeKernelInfo configure_G78_f32(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_G78_f16(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_G78_u8(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_G57_f32(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo configure_G57_f16(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
new file mode 100644
index 0000000000..215b17ef79
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Factory that builds the direct convolution kernel configuration matching a GPU target */
+class ClDirectConvKernelConfigurationFactory final
+{
+public:
+    /** Create the kernel configuration object for the architecture the GPU target belongs to
+     *
+     * Midgard is served by the Bifrost configuration built for G71, while Valhall and
+     * fifth-generation targets share the Valhall configuration. Any other architecture
+     * is reported through ARM_COMPUTE_ERROR.
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return Owning pointer to the selected @ref IClDirectConvKernelConfig implementation
+     */
+    static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu)
+    {
+        const auto arch = get_arch_from_target(gpu);
+        switch (arch)
+        {
+            case GPUTarget::MIDGARD:
+            {
+                return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71);
+            }
+            case GPUTarget::BIFROST:
+            {
+                return std::make_unique<ClDirectConvDefaultConfigBifrost>(gpu);
+            }
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+            {
+                return std::make_unique<ClDirectConvDefaultConfigValhall>(gpu);
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+            }
+        }
+    }
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
new file mode 100644
index 0000000000..e5b270c720
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** Basic container for the OpenCL direct convolution configuration functions
+ *
+ * Holds exactly three entries of type T, one per supported data type family
+ * (F32, F16 and 8-bit), and hands back the entry matching a given data type.
+ */
+template <class T>
+class ClDirectConvConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for direct convolution F32
+     * @param[in] func_f16  Function to call for direct convolution F16
+     * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+    {
+    }
+
+    /** Method to return the direct convolution configuration function based on data type
+     *
+     * The lookup does not modify the container, hence it is const-qualified and
+     * can also be called on const instances.
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the valid function otherwise it returns nullptr if the data type is not valid
+     */
+    T get_function(DataType data_type) const
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            // All the 8-bit quantized types share the same entry
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Abstract interface implemented by every direct convolution kernel configuration */
+class IClDirectConvKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target, kept in @ref _target for the derived classes
+     */
+    IClDirectConvKernelConfig(GPUTarget arch)
+        : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClDirectConvKernelConfig() = default;
+    /** Compute the @ref DirectConvComputeKernelInfo for the given inputs
+     *
+     * @param[in] src       Source tensor (activation tensor)
+     * @param[in] wei       Weights tensor
+     * @param[in] conv_info Convolution info
+     *
+     * @return The kernel descriptor chosen by the concrete configuration
+     */
+    virtual DirectConvComputeKernelInfo configure(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+
+protected:
+    GPUTarget _target; /**< GPU target received at construction */
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
new file mode 100644
index 0000000000..98ebf3ebbe
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+namespace
+{
+DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info,
+                                   const Size2D &dilation, unsigned int depth_multiplier, bool is_g71)
+{
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DWCComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const size_t      channel_idx = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+    const size_t      width_idx   = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+    const TensorShape wei_shape   = wei->tensor_shape();
+    const size_t      kernel_c    = wei_shape[channel_idx];
+    const size_t      kernel_w    = wei_shape[width_idx];
+
+    desc.export_input_to_cl_image = false;
+    // cl_image for the weights is never used on G71; elsewhere the helper decides
+    desc.export_weights_to_cl_image = is_g71 ? false : use_cl_image_for_weights(wei, depth_multiplier);
+
+    // n0 is 4 without depth multiplier, otherwise the largest of 4 / 2 / 1 dividing the multiplier
+    if ((depth_multiplier == 1) || ((depth_multiplier % 4) == 0))
+    {
+        desc.n0 = 4;
+    }
+    else if ((depth_multiplier % 2) == 0)
+    {
+        desc.n0 = 2;
+    }
+    else
+    {
+        desc.n0 = 1;
+    }
+
+    // Note: If we reduce n0, export to cl_image must be false
+    ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && desc.export_weights_to_cl_image);
+
+    desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+    // m0 goes above 1 only if stride_x == 1 and dilation_x == 1,
+    // and the kernel width is neither 1 nor >= 9
+    const bool unit_stride_and_dilation = (conv_info.stride().first == 1) && (dilation.x() == 1);
+    const bool kernel_w_allows_m0       = (kernel_w < 9) && (kernel_w != 1);
+    desc.m0                             = (unit_stride_and_dilation && kernel_w_allows_m0) ? 2 : 1;
+
+    return desc;
+}
+
+DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info,
+                                   const Size2D &dilation, unsigned int depth_multiplier, bool is_g71)
+{
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DWCComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    // Src and weights have the same dimension indices, so the ones looked up
+    // with the weights' layout are reused for the source shape
+    const size_t      channel_idx = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+    const size_t      width_idx   = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+    const TensorShape src_shape   = src->tensor_shape();
+    const TensorShape wei_shape   = wei->tensor_shape();
+    const size_t      src_w       = src_shape[width_idx];
+    const size_t      kernel_c    = wei_shape[channel_idx];
+    const size_t      kernel_w    = wei_shape[width_idx];
+
+    desc.export_input_to_cl_image = false;
+    // cl_image for the weights is never used on G71; elsewhere the helper decides
+    desc.export_weights_to_cl_image = is_g71 ? false : use_cl_image_for_weights(wei, depth_multiplier);
+
+    if (depth_multiplier == 1)
+    {
+        // Without depth multiplier n0 depends on whether the weights go through cl_image
+        desc.n0 = desc.export_weights_to_cl_image ? 4 : 8;
+    }
+    else if ((depth_multiplier % 4) == 0)
+    {
+        desc.n0 = 4;
+    }
+    else if ((depth_multiplier % 2) == 0)
+    {
+        desc.n0 = 2;
+    }
+    else
+    {
+        desc.n0 = 1;
+    }
+
+    // Note: If we reduce n0, export to cl_image must be false
+    ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && desc.export_weights_to_cl_image);
+
+    desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+    // m0 goes above 1 only if stride_x == 1 and dilation_x == 1,
+    // and the kernel width is neither 1 nor >= 9
+    const bool unit_stride_and_dilation = (conv_info.stride().first == 1) && (dilation.x() == 1);
+    const bool kernel_w_allows_m0       = (kernel_w < 9) && (kernel_w != 1);
+    if (unit_stride_and_dilation && kernel_w_allows_m0)
+    {
+        // Use 5 when it divides the source width exactly, 4 otherwise
+        desc.m0 = ((src_w % 5) == 0) ? 5 : 4;
+    }
+    else
+    {
+        desc.m0 = 1;
+    }
+
+    return desc;
+}
+} // namespace
+
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
+    : IClDWCNativeKernelConfig(gpu) // the GPU target is handed to the base class
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo   *src,
+                                                                const ITensorInfo   *wei,
+                                                                const PadStrideInfo &conv_info,
+                                                                const Size2D        &dilation,
+                                                                unsigned int         depth_multiplier)
+{
+    // Pointer to one of the private per-target / per-data-type heuristics of this class
+    using HeuristicFn = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+        unsigned int depth_multiplier);
+
+    // Both tables share the same 8-bit heuristic (configure_G7x_u8)
+    ClDWCNativeConfigArray<HeuristicFn> g71_table(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
+                                                  &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+                                                  &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+    ClDWCNativeConfigArray<HeuristicFn> g7x_table(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
+                                                  &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+                                                  &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+    // G71 has a dedicated table; every other target uses the G7x one
+    ClDWCNativeConfigArray<HeuristicFn> &table = (_target == GPUTarget::G71) ? g71_table : g7x_table;
+    const HeuristicFn                    func  = table.get_function(src->data_type());
+
+    // NOTE(review): if ARM_COMPUTE_ERROR_ON_MSG is compiled out in some build configurations,
+    // a null func would be invoked below - confirm the data type is validated by the caller.
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+    return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
+{
+    // G71 flavour of the shared F32 heuristic
+    constexpr bool is_g71 = true;
+    return configure_f32(src, wei, conv_info, dilation, depth_multiplier, is_g71);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
+{
+    // G71 flavour of the shared F16 heuristic
+    constexpr bool is_g71 = true;
+    return configure_f16(src, wei, conv_info, dilation, depth_multiplier, is_g71);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
+{
+    // Non-G71 flavour of the shared F32 heuristic
+    constexpr bool is_g71 = false;
+    return configure_f32(src, wei, conv_info, dilation, depth_multiplier, is_g71);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
+{
+    // Non-G71 flavour of the shared F16 heuristic
+    constexpr bool is_g71 = false;
+    return configure_f16(src, wei, conv_info, dilation, depth_multiplier, is_g71);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo   *src,
+                                                                       const ITensorInfo   *wei,
+                                                                       const PadStrideInfo &conv_info,
+                                                                       const Size2D        &dilation,
+                                                                       unsigned int         depth_multiplier)
+{
+    // The weights do not take part in this heuristic
+    ARM_COMPUTE_UNUSED(wei);
+
+    // Only NHWC sources get tuned values; other layouts keep the default descriptor.
+    DWCComputeKernelInfo desc;
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    // Neither the input nor the weights are exported to cl_image
+    desc.export_input_to_cl_image   = false;
+    desc.export_weights_to_cl_image = false;
+
+    if (depth_multiplier == 1)
+    {
+        desc.n0 = 4;
+    }
+    else
+    {
+        desc.n0 = 1;
+    }
+
+    // m0 is 2 only with unit stride_x, unit dilation_x and no depth multiplier
+    const bool use_two_rows = (conv_info.stride().first == 1) && (dilation.x() == 1) && (depth_multiplier == 1);
+    desc.m0                 = use_two_rows ? 2 : 1;
+
+    return desc;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
new file mode 100644
index 0000000000..41d86c9c14
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST
+
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Bifrost based OpenCL depthwise convolution configuration
+ *
+ * Exposes the inherited configure() entry point and declares private helpers named
+ * after a GPU slot (G71 / G7x) and a data type (f32 / f16 / u8).
+ */
+class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDWCNativeDefaultConfigBifrost(GPUTarget gpu);
+
+ // Inherited overridden method
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
+
+private: // Helpers below share the parameter list of configure()
+ DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, // G71 slot, F32 data
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, // G71 slot, F16 data: forwards to configure_f16 with the flag set to true
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, // G7x slot, F32 data: forwards to configure_f32 with the flag set to false
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, // G7x slot, F16 data: forwards to configure_f16 with the flag set to false
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, // G7x slot, 8-bit data: ignores wei and only fills the descriptor for NHWC inputs
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..ef1bb3858c
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu)
+    : IClDWCNativeKernelConfig(gpu) // the base class stores the target
+{
+    // No additional state to initialise.
+}
+
+/** Select and run the depthwise convolution heuristic for the stored GPU target
+ *
+ * The G77 target uses its own F16 heuristic and shares the G78 F32 / 8-bit ones;
+ * every other target (G78 included) uses the G78 set.
+ *
+ * @param[in] src              Source tensor info; its data type selects the heuristic
+ * @param[in] wei              Weights tensor info
+ * @param[in] conv_info        Convolution info
+ * @param[in] dilation         Kernel dilation
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return the descriptor produced by the selected heuristic; an error is raised if
+ *         the data type of @p src has no heuristic
+ */
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo   *src,
+                                                                const ITensorInfo   *wei,
+                                                                const PadStrideInfo &conv_info,
+                                                                const Size2D        &dilation,
+                                                                unsigned int         depth_multiplier)
+{
+    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+        unsigned int depth_multiplier);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+        &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+        &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+    // G77 only differs from G78 for F16
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+        &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+    switch (_target)
+    {
+        case GPUTarget::G77:
+            func = configs_G77.get_function(src->data_type());
+            break;
+        case GPUTarget::G78:
+        default:
+            func = configs_G78.get_function(src->data_type());
+            break;
+    }
+
+    // get_function() returns nullptr for unsupported data types. Check unconditionally:
+    // an assert-only check would let builds without asserts call through a null
+    // pointer-to-member below.
+    if (func == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Data type not supported for depthwise convolution");
+    }
+    return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(
+    const ITensorInfo   *src,
+    const ITensorInfo   *wei,
+    const PadStrideInfo &conv_info,
+    const Size2D        &dilation,
+    unsigned int         depth_multiplier)
+{
+    DWCComputeKernelInfo desc;
+
+    // Only NHWC is handled; any other layout keeps the default-constructed descriptor.
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const auto        wei_layout = wei->data_layout();
+    const TensorShape wei_dims   = wei->tensor_shape();
+    const size_t      kernel_c =
+        wei_dims[get_data_layout_dimension_index(wei_layout, DataLayoutDimension::CHANNEL)];
+    const size_t kernel_w = wei_dims[get_data_layout_dimension_index(wei_layout, DataLayoutDimension::WIDTH)];
+
+    desc.export_input_to_cl_image   = false;
+    desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+    // n0: 4 without a depth multiplier, otherwise the largest of {4, 2, 1} dividing it.
+    if ((depth_multiplier == 1) || ((depth_multiplier % 4) == 0))
+    {
+        desc.n0 = 4;
+    }
+    else if ((depth_multiplier % 2) == 0)
+    {
+        desc.n0 = 2;
+    }
+    else
+    {
+        desc.n0 = 1;
+    }
+
+    // Note: If we reduce n0, export to cl_image must be false
+    const auto adjusted_n0 = adjust_vec_size(desc.n0, kernel_c);
+    ARM_COMPUTE_ERROR_ON((adjusted_n0 != desc.n0) && desc.export_weights_to_cl_image);
+    desc.n0 = adjusted_n0;
+
+    // m0 is raised to 2 only if stride_x == 1 and dilation_x == 1, and the kernel
+    // width is neither 1 nor >= 9.
+    const bool unit_step_x     = (conv_info.stride().first == 1) && (dilation.x() == 1);
+    const bool width_excluded  = (kernel_w >= 9) || (kernel_w == 1);
+    desc.m0                    = (unit_step_x && !width_excluded) ? 2 : 1;
+
+    return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(
+    const ITensorInfo   *src,
+    const ITensorInfo   *wei,
+    const PadStrideInfo &conv_info,
+    const Size2D        &dilation,
+    unsigned int         depth_multiplier)
+{
+    DWCComputeKernelInfo desc;
+
+    // Only NHWC is handled; any other layout keeps the default-constructed descriptor.
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    // Src and weights have the same dimension indices, so the weights layout is used for both
+    const auto   wei_layout  = wei->data_layout();
+    const size_t channel_dim = get_data_layout_dimension_index(wei_layout, DataLayoutDimension::CHANNEL);
+    const size_t width_dim   = get_data_layout_dimension_index(wei_layout, DataLayoutDimension::WIDTH);
+
+    const TensorShape src_dims = src->tensor_shape();
+    const TensorShape wei_dims = wei->tensor_shape();
+    const size_t      src_w    = src_dims[width_dim];
+    const size_t      kernel_c = wei_dims[channel_dim];
+    const size_t      kernel_w = wei_dims[width_dim];
+
+    desc.export_input_to_cl_image   = false;
+    desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+    // n0: without a depth multiplier use 8 for buffer weights and 4 for cl_image weights;
+    // otherwise the largest of {4, 2, 1} dividing the depth multiplier.
+    if (depth_multiplier == 1)
+    {
+        desc.n0 = desc.export_weights_to_cl_image ? 4 : 8;
+    }
+    else if ((depth_multiplier % 4) == 0)
+    {
+        desc.n0 = 4;
+    }
+    else if ((depth_multiplier % 2) == 0)
+    {
+        desc.n0 = 2;
+    }
+    else
+    {
+        desc.n0 = 1;
+    }
+
+    // Note: If we reduce n0, export to cl_image must be false
+    const auto adjusted_n0 = adjust_vec_size(desc.n0, kernel_c);
+    ARM_COMPUTE_ERROR_ON((adjusted_n0 != desc.n0) && desc.export_weights_to_cl_image);
+    desc.n0 = adjusted_n0;
+
+    // m0 is raised above 1 only if stride_x == 1 and dilation_x == 1, and the kernel
+    // width is neither 1 nor >= 9; it is then 5 when the source width is a multiple
+    // of 5 and 4 otherwise.
+    const bool unit_step_x    = (conv_info.stride().first == 1) && (dilation.x() == 1);
+    const bool width_excluded = (kernel_w >= 9) || (kernel_w == 1);
+    if (unit_step_x && !width_excluded)
+    {
+        desc.m0 = ((src_w % 5) == 0) ? 5 : 4;
+    }
+    else
+    {
+        desc.m0 = 1;
+    }
+
+    return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(
+    const ITensorInfo   *src,
+    const ITensorInfo   *wei,
+    const PadStrideInfo &conv_info,
+    const Size2D        &dilation,
+    unsigned int         depth_multiplier)
+{
+    // The weights play no role in the 8-bit heuristic.
+    ARM_COMPUTE_UNUSED(wei);
+
+    DWCComputeKernelInfo desc;
+
+    const bool is_nhwc = (src->data_layout() == DataLayout::NHWC);
+    if (!is_nhwc)
+    {
+        // Layouts other than NHWC keep the default-constructed descriptor.
+        return desc;
+    }
+
+    // cl_image is used neither for the input nor for the weights.
+    desc.export_input_to_cl_image   = false;
+    desc.export_weights_to_cl_image = false;
+
+    const bool no_multiplier = (depth_multiplier == 1);
+    desc.n0                  = no_multiplier ? 4 : 1;
+
+    // Two rows per work-item only for stride_x == 1, dilation_x == 1 and no depth multiplier.
+    const bool two_rows = (conv_info.stride().first == 1) && (dilation.x() == 1) && no_multiplier;
+    desc.m0             = two_rows ? 2 : 1;
+
+    return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(
+    const ITensorInfo   *src,
+    const ITensorInfo   *wei,
+    const PadStrideInfo &conv_info,
+    const Size2D        &dilation,
+    unsigned int         depth_multiplier)
+{
+    DWCComputeKernelInfo desc;
+
+    // Only NHWC is handled; any other layout keeps the default-constructed descriptor.
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const auto        wei_layout = wei->data_layout();
+    const TensorShape wei_dims   = wei->tensor_shape();
+    const size_t      kernel_c =
+        wei_dims[get_data_layout_dimension_index(wei_layout, DataLayoutDimension::CHANNEL)];
+    const size_t kernel_w = wei_dims[get_data_layout_dimension_index(wei_layout, DataLayoutDimension::WIDTH)];
+
+    desc.export_input_to_cl_image   = false;
+    desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+    // n0: without a depth multiplier use 8 for buffer weights and 4 for cl_image weights;
+    // otherwise the largest of {4, 2, 1} dividing the depth multiplier.
+    if (depth_multiplier == 1)
+    {
+        desc.n0 = desc.export_weights_to_cl_image ? 4 : 8;
+    }
+    else if ((depth_multiplier % 4) == 0)
+    {
+        desc.n0 = 4;
+    }
+    else if ((depth_multiplier % 2) == 0)
+    {
+        desc.n0 = 2;
+    }
+    else
+    {
+        desc.n0 = 1;
+    }
+
+    // Note: If we reduce n0, export to cl_image must be false
+    const auto adjusted_n0 = adjust_vec_size(desc.n0, kernel_c);
+    ARM_COMPUTE_ERROR_ON((adjusted_n0 != desc.n0) && desc.export_weights_to_cl_image);
+    desc.n0 = adjusted_n0;
+
+    // m0 is raised to 2 only if stride_x == 1 and dilation_x == 1, and the kernel
+    // width is neither 1 nor >= 9.
+    const bool unit_step_x    = (conv_info.stride().first == 1) && (dilation.x() == 1);
+    const bool width_excluded = (kernel_w >= 9) || (kernel_w == 1);
+    desc.m0                   = (unit_step_x && !width_excluded) ? 2 : 1;
+
+    return desc;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
new file mode 100644
index 0000000000..fabce77b54
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL
+
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Valhall based OpenCL depthwise convolution configuration
+ *
+ * configure() picks one of the private helpers below from the stored GPU target and
+ * the data type of the source tensor. The G77 target only has a dedicated F16 helper
+ * and otherwise shares the G78 ones.
+ */
+class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDWCNativeDefaultConfigValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
+
+private: // Helpers below share the parameter list of configure()
+ DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, // F32 heuristic, used for G78 (and default) as well as G77
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, // F16 heuristic for G78 (and default)
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, // 8-bit heuristic, used for G78 (and default) as well as G77
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, // F16 heuristic specific to G77
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/cpu/operators/CpuDequantize.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
index 80a2e28aee..c8b006c546 100644
--- a/src/runtime/cpu/operators/CpuDequantize.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,34 +21,41 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/runtime/cpu/operators/CpuDequantize.h"
-
+#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuDequantizeKernel.h"
+#include "arm_compute/core/TensorShape.h"
namespace arm_compute
{
-namespace cpu
+namespace cl_dwc
{
-void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) // true only if every check below passes
{
- auto k = std::make_unique<kernels::CpuDequantizeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
+ // First check whether the weights can be exported to a cl image at all.
+ if (!export_to_cl_image(weights))
+ {
+ return false;
+ }
-Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuDequantizeKernel::validate(src, dst);
-}
+ const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t kernel_w = weights->tensor_shape()[idx_w];
+ const size_t kernel_h = weights->tensor_shape()[idx_h];
-void CpuDequantize::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- prepare(tensors);
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+ // Even when the cl image storage is available for the weights, the cl buffer storage is preferred for performance reasons in the following cases:
+ // 1- When the kernel size is 1x1
+ // 2- When the depth multiplier is greater than 1 and not a multiple of 4.
+ if ((kernel_w == 1) && (kernel_h == 1))
+ {
+ return false; // case 1: 1x1 kernel
+ }
+
+ if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+ {
+ return false; // case 2: depth multiplier > 1 and not a multiple of 4
+ }
+
+ return true;
}
-} // namespace cpu
+} // namespace cl_dwc
} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCast.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
index 5a4f6c518e..e3484c04ff 100644
--- a/src/runtime/cpu/operators/CpuCast.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,24 +21,25 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/runtime/cpu/operators/CpuCast.h"
-
-#include "src/core/cpu/kernels/CpuCastKernel.h"
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
namespace arm_compute
{
-namespace cpu
-{
-void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
- auto k = std::make_unique<kernels::CpuCastKernel>();
- k->configure(src, dst, policy);
- _kernel = std::move(k);
-}
+// Forward declaration
+class ITensorInfo;
-Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+namespace cl_dwc
{
- return kernels::CpuCastKernel::validate(src, dst, policy);
-}
-} // namespace cpu
+/** Utility function to decide whether the weights of depthwise convolution should be kept in the cl image storage to get better performance
+ *
+ * The function returns false when export_to_cl_image() rejects the weights, when the
+ * kernel size is 1x1, or when the depth multiplier is greater than 1 and not a
+ * multiple of 4. In those cases the cl buffer storage is used instead.
+ *
+ * @param[in] weights Weights TensorInfo of the depthwise convolution
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance
+ */
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier);
+
+} // namespace cl_dwc
} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..031cf1859a
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Factory selecting the ClDWCNative kernel configuration class for a GPU target */
+class ClDWCNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to create the ClDWCNative kernel configuration object matching the architecture of the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return the IClDWCNativeKernelConfig implementation for the architecture of @p gpu;
+     *         an error is raised for unsupported architectures
+     */
+    static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
+    {
+        const auto arch = get_arch_from_target(gpu);
+
+        if (arch == GPUTarget::MIDGARD)
+        {
+            // The heuristic for Midgard is the same as the one used for Arm Mali-G71
+            return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71);
+        }
+        if (arch == GPUTarget::BIFROST)
+        {
+            return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu);
+        }
+        if ((arch == GPUTarget::VALHALL) || (arch == GPUTarget::FIFTHGEN))
+        {
+            // Both architectures share the Valhall heuristic
+            return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu);
+        }
+
+        ARM_COMPUTE_ERROR("Not supported GPU target");
+    }
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..614a6622df
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Basic container for the OpenCL depthwise convolution configuration functions
+ *
+ * Stores one entry of type T for each of the three supported data-type groups
+ * (F32, F16 and 8-bit quantized) and returns the entry matching a given data type.
+ */
+template <class T>
+class ClDWCNativeConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for depthwise convolution F32
+     * @param[in] func_f16  Function to call for depthwise convolution F16
+     * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+    {
+    }
+
+    /** Method to return the depthwise convolution configuration function based on data type
+     *
+     * The lookup does not modify the container, hence the method is const.
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the valid function otherwise it returns nullptr if the data type is not valid
+     */
+    T get_function(DataType data_type) const
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                // All the 8-bit quantized types share the same entry
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs; /**< Entries indexed by DT_F32, DT_F16 and DT_INT8 */
+};
+
+/** Basic interface for the depthwise convolution kernel configuration
+ *
+ * Stores the GPU target given at construction and leaves the choice of the
+ * kernel descriptor to the configure() method of the derived classes.
+ */
+class IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); // NOTE(review): presumably disables copy and keeps move, going by the macro name - definition not visible here
+ /** Virtual destructor */
+ virtual ~IClDWCNativeKernelConfig() = default;
+ /** This method returns the @ref DWCComputeKernelInfo for the given inputs
+ *
+ * @param[in] src Source tensor (activation tensor)
+ * @param[in] wei Weights tensor
+ * @param[in] conv_info Convolution info
+ * @param[in] dilation Kernel dilation
+ * @param[in] depth_multiplier Output feature maps multiplier
+ *
+ * @return the @ref DWCComputeKernelInfo selected by the derived class
+ */
+ virtual DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) = 0;
+
+protected:
+ GPUTarget _target; /**< GPU target passed to the constructor */
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..3380d8f1b7
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu)
+    : IClIndirectConvKernelConfig(gpu) // the GPU target is handed to the base class
+{
+    // No additional state to initialise.
+}
+
+/** Select and run the indirect convolution heuristic for the data type of the source
+ *
+ * The G77 heuristics are used regardless of the stored GPU target.
+ *
+ * @param[in] src       Source tensor info; its data type selects the heuristic
+ * @param[in] wei       Weights tensor info
+ * @param[in] conv_info Convolution info
+ *
+ * @return the descriptor produced by the selected heuristic; an error is raised if
+ *         the data type of @p src has no heuristic
+ */
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo   *src,
+                                                                          const ITensorInfo   *wei,
+                                                                          const PadStrideInfo &conv_info)
+{
+    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+
+    ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
+
+    // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes
+    // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned
+    // for the pointwise convolution cases.
+
+    ConfigurationFunctionExecutorPtr func = configs_G77.get_function(src->data_type());
+
+    // Check unconditionally: an assert-only check would let builds without asserts
+    // call through a null pointer-to-member below.
+    if (func == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Data type not supported for indirect convolution");
+    }
+    return (this->*func)(src, wei, conv_info);
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    // Only NHWC is handled; any other layout keeps the default-constructed descriptor.
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+    const bool        weights_as_image = export_to_cl_image(wei);
+    const int32_t     stride_x         = conv_info.stride().first;
+    const int32_t     stride_y         = conv_info.stride().second;
+    const int32_t     ofm              = dst_shape[0];
+    // NOTE(review): the F16 heuristic uses dst_shape[1] * dst_shape[2] without the
+    // stride division - confirm that dividing by the strides here is intended.
+    const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y);
+
+    desc.export_weights_to_cl_image = weights_as_image;
+
+    if (ofm <= 4)
+    {
+        // Few output feature maps: single row, narrow n0, deep k0
+        desc.m0 = 1;
+        desc.n0 = 2;
+        desc.k0 = 16;
+        return desc;
+    }
+
+    // The 16000 threshold value has been identified as the right
+    // one for using the biggest block size allowed on F32: 5x4x4
+    desc.m0 = (m < 16000) ? 4 : 5;
+    desc.n0 = 4;
+    desc.k0 = 4;
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(
+    const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    // Only NHWC is handled; any other layout keeps the default-constructed descriptor.
+    if (src->data_layout() != DataLayout::NHWC)
+    {
+        return desc;
+    }
+
+    const TensorShape wei_shape = wei->tensor_shape();
+    const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+    const bool        weights_as_image = export_to_cl_image(wei);
+
+    const int32_t ofm = dst_shape[0];
+    const int32_t m   = dst_shape[1] * dst_shape[2];
+    const int32_t k   = wei_shape[0];
+
+    desc.export_weights_to_cl_image = weights_as_image;
+
+    if (ofm <= 4)
+    {
+        // k0 should be as large as possible. However, we should avoid
+        // having left-over for loops that make the implementation slower.
+        desc.k0 = ((k % 16) == 0) ? 16 : (((k % 8) == 0) ? 8 : 4);
+        desc.m0 = 1;
+        desc.n0 = ofm;
+        return desc;
+    }
+
+    // The 16000 threshold value has been identified as the right
+    // one for using the biggest block size allowed on F16: 8x4
+    const bool use_biggest_block = (m >= 16000) && (k < 4);
+    desc.m0                      = use_biggest_block ? 8 : 5;
+    desc.n0                      = 4;
+    // In the biggest-block case k0 is clamped to k inside the kernel when k is less than 4
+    desc.k0 = use_biggest_block ? 4 : 8;
+
+    return desc;
+}
+} // namespace cl_indirect_conv
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCopy.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
index 057bb6efa5..bab808c66c 100644
--- a/src/runtime/cpu/operators/CpuCopy.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,37 +21,35 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_COPY_H
-#define ARM_COMPUTE_CPU_COPY_H
+#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL
+#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL
-#include "src/runtime/cpu/ICpuOperator.h"
+#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
namespace arm_compute
{
-namespace cpu
+namespace cl_indirect_conv
{
-/** Basic function to run @ref kernels::CpuCopyKernel */
-class CpuCopy : public ICpuOperator
+/** Default OpenCL indirect convolution kernel configuration heuristics for Valhall-based GPUs */
+class ClIndirectConvDefaultConfigValhall final : public IClIndirectConvKernelConfig
{
public:
- /** Constructor */
- CpuCopy() = default;
- /** Configure operator for a given list of arguments
+ /** Constructor
*
- * @param[in] src Source tensor info. Data type supported: All
- * @param[out] dst Destination info. Data type supported: Same as @p src
+ * @param[in] gpu GPU target
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst);
+ ClIndirectConvDefaultConfigValhall(GPUTarget gpu);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuCopy
- *
- * @param[in] src Source tensor info. Data type supported: All
- * @param[in] dst Destination tensor info. Data type supported: Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+ // Inherited overridden method
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+
+private:
+ DirectConvComputeKernelInfo
+ configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
-} // namespace cpu
+} // namespace cl_indirect_conv
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_COPY_H */
+#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
new file mode 100644
index 0000000000..5e7ba6f8e9
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H
+
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+/** Factory for the ClIndirectConvolution kernel configuration classes */
+class ClIndirectConvKernelConfigurationFactory final
+{
+public:
+    /** Create the ClIndirectConvolution kernel configuration object matching the GPU target
+     *
+     * The architecture is derived from @p gpu. Every architecture listed in the switch below
+     * is served by @ref ClIndirectConvDefaultConfigValhall; any other one raises an error.
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return the kernel configuration object, owned by the caller
+     */
+    static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu)
+    {
+        const auto arch = get_arch_from_target(gpu);
+
+        switch (arch)
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+            {
+                return std::make_unique<ClIndirectConvDefaultConfigValhall>(gpu);
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+            }
+        }
+    }
+};
+} // namespace cl_indirect_conv
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H
diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
new file mode 100644
index 0000000000..d05da18b58
--- /dev/null
+++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cl_indirect_conv
+{
+/** Basic container for the OpenCL indirect convolution configuration functions
+ *
+ * Stores one entry per supported data type (F32 and F16) and hands back the entry
+ * matching a given @ref DataType.
+ *
+ * @tparam T Type of the stored configuration function. It must be constructible from
+ *           nullptr, since nullptr is returned for unsupported data types.
+ */
+template <class T>
+class ClIndirectConvConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+
+    /** Constructor
+     *
+     * @param[in] func_f32 Function to call for indirect convolution F32
+     * @param[in] func_f16 Function to call for indirect convolution F16
+     *
+     */
+    ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16}
+    {
+    }
+
+    /** Method to return the indirect convolution configuration function based on data type
+     *
+     * The lookup does not modify the container, hence it is const-qualified.
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the function registered for @p data_type, or nullptr if the data type is not F32 or F16
+     */
+    T get_function(DataType data_type) const
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 2> _configs; /**< Entries indexed by DT_F32 and DT_F16 */
+};
+
+/** Basic interface for the indirect convolution kernel configuration
+ *
+ * Implementations receive the GPU target at construction and expose a single
+ * @ref configure method returning the kernel descriptor to use.
+ */
+class IClIndirectConvKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target, stored for use by the derived classes
+     */
+    IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClIndirectConvKernelConfig() = default;
+    /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs
+     *
+     * @param[in] src       Source tensor (activation tensor)
+     * @param[in] wei       Weights tensor
+     * @param[in] conv_info Convolution info
+     *
+     * @return the kernel descriptor selected for the given inputs
+     */
+    virtual DirectConvComputeKernelInfo
+    configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+
+protected:
+    GPUTarget _target; /**< GPU target passed to the constructor */
+};
+} // namespace cl_indirect_conv
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..3a02a60650
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Constructor: forwards the GPU target to the base kernel-configuration interface */
+ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu)
+    : IClMatMulNativeKernelConfig(gpu)
+{
+}
+
+/** Select the matmul native kernel configuration for the given operands
+ *
+ * Picks the per-target, per-data-type heuristic, derives the problem sizes
+ * (m, n, k, b) from the operand shapes and the transposition flags in @p info,
+ * and delegates to that heuristic.
+ *
+ * @param[in] lhs  LHS tensor info. Its data type selects the heuristic
+ * @param[in] rhs  RHS tensor info
+ * @param[in] info MatMul info carrying the adj_lhs / adj_rhs flags
+ *
+ * @return the kernel info produced by the selected heuristic
+ */
+MatMulKernelInfo
+ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
+{
+    using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+
+    ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+        &ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
+        &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
+        &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
+
+    ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+        &ClMatMulNativeDefaultConfigValhall::configure_G715_f32,
+        &ClMatMulNativeDefaultConfigValhall::configure_G715_f16,
+        &ClMatMulNativeDefaultConfigValhall::configure_G715_u8);
+
+    // G715 and G615 have dedicated heuristics; every other target uses the G710 ones
+    const bool use_g715_heuristics = (_target == GPUTarget::G715) || (_target == GPUTarget::G615);
+    auto      &configs_for_target  = use_g715_heuristics ? configs_G715 : configs_G710;
+
+    const ConfigurationFunctionExecutorPtr func = configs_for_target.get_function(lhs->data_type());
+
+    // Work on a copy of the LHS shape: dimensions from index 2 onwards are collapsed into one
+    TensorShape lhs_shape = lhs->tensor_shape();
+    if (lhs_shape.num_dimensions() > 2)
+    {
+        lhs_shape.collapse_from(2);
+    }
+    const TensorShape rhs_shape = rhs->tensor_shape();
+
+    const bool lhs_transposed = info.adj_lhs();
+    const bool rhs_transposed = info.adj_rhs();
+
+    // The x/y roles of each operand swap when that operand is transposed
+    const unsigned int m = lhs_transposed ? lhs_shape.x() : lhs_shape.y();
+    const unsigned int n = rhs_transposed ? rhs_shape.y() : rhs_shape.x();
+    const unsigned int k = lhs_transposed ? lhs_shape.y() : lhs_shape.x();
+    const unsigned int b = lhs_shape.z();
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native");
+    return (this->*func)(m, n, k, b, rhs->lock_paddings(), info);
+}
+
+/** G715 F32 heuristic: a fixed block configuration, independent of the problem size
+ *
+ * @param[in] m, n, k, b       Unused
+ * @param[in] rhs_lock_padding Unused
+ * @param[in] info             MatMul info providing the adj_lhs / adj_rhs flags
+ *
+ * @return kernel info with m0 = 1, n0 = 4, k0 = 1 and no export to cl_image
+ */
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+
+    constexpr int  m0                 = 1;
+    constexpr int  n0                 = 4;
+    constexpr int  k0                 = 1;
+    constexpr bool export_to_cl_image = false;
+
+    return {info.adj_lhs(), info.adj_rhs(), m0, n0, k0, export_to_cl_image};
+}
+
+/** G715 F16 heuristic: reuses the G715 F32 heuristic with the same arguments
+ *
+ * @return the kernel info returned by @ref configure_G715_f32
+ */
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    // No F16-specific tuning: forward everything to the F32 variant
+    return configure_G715_f32(m, n, k, b, rhs_lock_padding, info);
+}
+
+/** G715 U8 heuristic: a fixed block configuration, independent of the problem size
+ *
+ * @param[in] m, n, k, b       Unused
+ * @param[in] rhs_lock_padding Unused
+ * @param[in] info             MatMul info providing the adj_lhs / adj_rhs flags
+ *
+ * @return kernel info with m0 = 4, n0 = 16, k0 = 4 and no export to cl_image
+ */
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+
+    constexpr int  m0                 = 4;
+    constexpr int  n0                 = 16;
+    constexpr int  k0                 = 4;
+    constexpr bool export_to_cl_image = false;
+
+    return {info.adj_lhs(), info.adj_rhs(), m0, n0, k0, export_to_cl_image};
+}
+
+/** G710 F32 heuristic
+ *
+ * Holds one "best" and one "fallback" table per combination of the adj_lhs / adj_rhs flags
+ * (nt = not transposed, t = transposed). A candidate is looked up in each table of the selected
+ * pair and select_info() decides between the two.
+ *
+ * NOTE(review): each table row presumably reads {m, n, k, b, m0, n0, k0, export_to_cl_image};
+ * confirm against ClMatMulNativeHelpers.h.
+ *
+ * @param[in] m, n, k, b       Problem sizes forwarded to find_info() and select_info()
+ * @param[in] rhs_lock_padding Forwarded to select_info()
+ * @param[in] info             MatMul info providing the adj_lhs / adj_rhs flags
+ *
+ * @return the kernel info chosen by select_info()
+ */
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    // LHS not transposed, RHS not transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+        {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1},
+        {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+        {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+        {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+    // LHS not transposed, RHS transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+        {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+        {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+        {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+        {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}};
+
+    // LHS transposed, RHS not transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+        {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+        {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+    // LHS transposed, RHS transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+        {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+        {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+        {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+    const bool adj_lhs = info.adj_lhs();
+    const bool adj_rhs = info.adj_rhs();
+
+    // Pick the best/fallback pair of tables matching the transposition flags
+    const MatMulNativeConfigsMatrix &best_table =
+        adj_lhs ? (adj_rhs ? configs_mnkb_best_t_t : configs_mnkb_best_t_nt)
+                : (adj_rhs ? configs_mnkb_best_nt_t : configs_mnkb_best_nt_nt);
+    const MatMulNativeConfigsMatrix &fallback_table =
+        adj_lhs ? (adj_rhs ? configs_mnkb_fallback_t_t : configs_mnkb_fallback_t_nt)
+                : (adj_rhs ? configs_mnkb_fallback_nt_t : configs_mnkb_fallback_nt_nt);
+
+    MatMulKernelInfo best_candidate     = find_info(best_table, adj_lhs, adj_rhs, m, n, k, b);
+    MatMulKernelInfo fallback_candidate = find_info(fallback_table, adj_lhs, adj_rhs, m, n, k, b);
+
+    return select_info(best_candidate, fallback_candidate, m, n, k, b, DataType::F32, rhs_lock_padding);
+}
+
+/** G710 F16 heuristic
+ *
+ * Holds one "best" and one "fallback" table per combination of the adj_lhs / adj_rhs flags
+ * (nt = not transposed, t = transposed). A candidate is looked up in each table of the selected
+ * pair and select_info() decides between the two.
+ *
+ * NOTE(review): each table row presumably reads {m, n, k, b, m0, n0, k0, export_to_cl_image};
+ * confirm against ClMatMulNativeHelpers.h.
+ *
+ * @param[in] m, n, k, b       Problem sizes forwarded to find_info() and select_info()
+ * @param[in] rhs_lock_padding Forwarded to select_info()
+ * @param[in] info             MatMul info providing the adj_lhs / adj_rhs flags
+ *
+ * @return the kernel info chosen by select_info()
+ */
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    // LHS not transposed, RHS not transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1},
+        {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+        {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0},
+        {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0},
+        {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}};
+
+    // LHS not transposed, RHS transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+        {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1},
+        {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+        {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0},
+        {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0},
+        {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}};
+
+    // LHS transposed, RHS not transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+        {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+    // LHS transposed, RHS transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+        {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+        {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+    const bool adj_lhs = info.adj_lhs();
+    const bool adj_rhs = info.adj_rhs();
+
+    // Pick the best/fallback pair of tables matching the transposition flags
+    const MatMulNativeConfigsMatrix &best_table =
+        adj_lhs ? (adj_rhs ? configs_mnkb_best_t_t : configs_mnkb_best_t_nt)
+                : (adj_rhs ? configs_mnkb_best_nt_t : configs_mnkb_best_nt_nt);
+    const MatMulNativeConfigsMatrix &fallback_table =
+        adj_lhs ? (adj_rhs ? configs_mnkb_fallback_t_t : configs_mnkb_fallback_t_nt)
+                : (adj_rhs ? configs_mnkb_fallback_nt_t : configs_mnkb_fallback_nt_nt);
+
+    MatMulKernelInfo best_candidate     = find_info(best_table, adj_lhs, adj_rhs, m, n, k, b);
+    MatMulKernelInfo fallback_candidate = find_info(fallback_table, adj_lhs, adj_rhs, m, n, k, b);
+
+    return select_info(best_candidate, fallback_candidate, m, n, k, b, DataType::F16, rhs_lock_padding);
+}
+
+/** G710 U8 heuristic
+ *
+ * Holds a single table per combination of the adj_lhs / adj_rhs flags (nt = not transposed,
+ * t = transposed) and returns what find_info() yields for the matching table. Unlike the
+ * floating point variants there is no fallback table and select_info() is not involved.
+ *
+ * NOTE(review): each table row presumably reads {m, n, k, b, m0, n0, k0, export_to_cl_image};
+ * confirm against ClMatMulNativeHelpers.h.
+ *
+ * @param[in] m, n, k, b       Problem sizes forwarded to find_info()
+ * @param[in] rhs_lock_padding Unused
+ * @param[in] info             MatMul info providing the adj_lhs / adj_rhs flags
+ *
+ * @return the kernel info returned by find_info()
+ */
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    ARM_COMPUTE_UNUSED(rhs_lock_padding);
+
+    // LHS not transposed, RHS not transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+        {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0},
+        {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}};
+
+    // LHS not transposed, RHS transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+        {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0},
+        {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0},
+        {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}};
+
+    // LHS transposed, RHS not transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+    // LHS transposed, RHS transposed
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+        {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+        {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}};
+
+    const bool adj_lhs = info.adj_lhs();
+    const bool adj_rhs = info.adj_rhs();
+
+    // Pick the table matching the transposition flags
+    const MatMulNativeConfigsMatrix &table_to_use =
+        adj_lhs ? (adj_rhs ? configs_mnkb_best_t_t : configs_mnkb_best_t_nt)
+                : (adj_rhs ? configs_mnkb_best_nt_t : configs_mnkb_best_nt_nt);
+
+    return find_info(table_to_use, adj_lhs, adj_rhs, m, n, k, b);
+}
+} // namespace cl_matmul
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
new file mode 100644
index 0000000000..5279871057
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
+
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Valhall based OpenCL matmul native kernel configuration
+ *
+ * Provides one heuristic per supported data type (F32, F16, U8) for each of the
+ * G710 and G715 target families.
+ */
+class ClMatMulNativeDefaultConfigValhall final : public IClMatMulNativeKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClMatMulNativeDefaultConfigValhall(GPUTarget gpu);
+
+    // Inherited method overridden
+    MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override;
+
+private:
+    // All heuristics share one signature so that configure() can dispatch
+    // through a single pointer-to-member type.
+
+    /** G710 heuristic for F32 */
+    MatMulKernelInfo configure_G710_f32(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    /** G710 heuristic for F16 */
+    MatMulKernelInfo configure_G710_f16(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    /** G710 heuristic for U8 */
+    MatMulKernelInfo configure_G710_u8(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    /** G715 heuristic for F32 */
+    MatMulKernelInfo configure_G715_f32(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    /** G715 heuristic for F16 */
+    MatMulKernelInfo configure_G715_f16(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    /** G715 heuristic for U8 */
+    MatMulKernelInfo configure_G715_u8(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp
new file mode 100644
index 0000000000..3878f698fd
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Constructor: forwards the GPU target to the base kernel-variant interface */
+ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu) : IClMatMulNativeKernelVariant(gpu)
+{
+}
+
+/** Select the matmul native kernel variant for the given operands
+ *
+ * The per-target, per-data-type selector is called with the K dimension of the LHS
+ * (taking the adj_lhs flag into account) and whether an activation is enabled.
+ *
+ * @param[in] lhs      LHS tensor info. Its data type and shape drive the selection
+ * @param[in] rhs      Unused
+ * @param[in] info     MatMul info providing the adj_lhs flag
+ * @param[in] act_info Activation info; only its enabled state is used
+ *
+ * @return the kernel type chosen by the selected heuristic
+ */
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo         *lhs,
+                                                                    const ITensorInfo         *rhs,
+                                                                    const MatMulInfo          &info,
+                                                                    const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(rhs);
+
+    using VariantFunctionExecutorPtr =
+        MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled);
+
+    ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_G715(
+        &ClMatMulNativeDefaultVariantValhall::configure_G715_float,
+        &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized);
+
+    ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_default(
+        &ClMatMulNativeDefaultVariantValhall::configure_default_float,
+        &ClMatMulNativeDefaultVariantValhall::configure_default_quantized);
+
+    // G715 and G615 have dedicated selectors; every other target uses the default ones
+    const bool use_g715_selectors = (_target == GPUTarget::G715) || (_target == GPUTarget::G615);
+    auto      &selectors          = use_g715_selectors ? configs_G715 : configs_default;
+
+    const VariantFunctionExecutorPtr func = selectors.get_function(lhs->data_type());
+
+    // K sits on the y axis of the LHS when the LHS is transposed, on the x axis otherwise
+    const int  k           = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+    const bool act_enabled = act_info.enabled();
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native");
+    return (this->*func)(k, act_enabled);
+}
+
+/** G715 kernel variant selection for floating point data
+ *
+ * @param[in] k           K dimension of the matrix multiplication
+ * @param[in] act_enabled True if an activation is enabled
+ *
+ * @return NATIVE_MMUL_FP when no activation is enabled and @p k is a multiple of 4, NATIVE_FP otherwise
+ */
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled)
+{
+    // MMUL kernel works only when K is a multiple of 4
+    const bool use_mmul = !act_enabled && ((k % 4) == 0);
+
+    return use_mmul ? MatMulKernelType::NATIVE_MMUL_FP : MatMulKernelType::NATIVE_FP;
+}
+
+/** G715 kernel variant selection for quantized data
+ *
+ * @param[in] k           K dimension of the matrix multiplication
+ * @param[in] act_enabled True if an activation is enabled
+ *
+ * @return NATIVE_MMUL_QUANTIZED when no activation is enabled and @p k is a multiple of 16,
+ *         NATIVE_QUANTIZED otherwise
+ */
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled)
+{
+    // MMUL kernel works only when K is a multiple of 16
+    const bool use_mmul = !act_enabled && ((k % 16) == 0);
+
+    return use_mmul ? MatMulKernelType::NATIVE_MMUL_QUANTIZED : MatMulKernelType::NATIVE_QUANTIZED;
+}
+
+/** Default kernel variant selection for floating point data
+ *
+ * @param[in] k           Unused
+ * @param[in] act_enabled Unused
+ *
+ * @return always NATIVE_FP
+ */
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled)
+{
+    // Both arguments are ignored: the result does not depend on them
+    ARM_COMPUTE_UNUSED(k, act_enabled);
+    return MatMulKernelType::NATIVE_FP;
+}
+
+/** Default kernel variant selection for quantized data
+ *
+ * @param[in] k           Unused
+ * @param[in] act_enabled Unused
+ *
+ * @return always NATIVE_QUANTIZED
+ */
+MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled)
+{
+    // Both arguments are ignored: the result does not depend on them
+    ARM_COMPUTE_UNUSED(k, act_enabled);
+    return MatMulKernelType::NATIVE_QUANTIZED;
+}
+
+} // namespace cl_matmul
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h
new file mode 100644
index 0000000000..a202676e98
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Valhall based OpenCL matmul configuration
+ *
+ * Chooses which MatMul Native kernel variant to run from the GPU target, the LHS data type,
+ * the K dimension and whether the activation is enabled.
+ */
+class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant
+{
+public:
+    /** Constructor
+     *
+     * Marked explicit so that a GPUTarget is never silently converted into a heuristic object.
+     *
+     * @param[in] gpu GPU target
+     */
+    explicit ClMatMulNativeDefaultVariantValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    MatMulKernelType select_kernel(const ITensorInfo         *lhs,
+                                   const ITensorInfo         *rhs,
+                                   const MatMulInfo          &info,
+                                   const ActivationLayerInfo &act_info) override;
+
+private:
+    // The four selectors deliberately share one signature: select_kernel() dispatches to them
+    // through a pointer to member function.
+
+    /** G715 path, float types: MMUL variant when the activation is disabled and K is a multiple of 4 */
+    MatMulKernelType configure_G715_float(int k, bool act_enabled);
+    /** G715 path, quantized types: MMUL variant when the activation is disabled and K is a multiple of 16 */
+    MatMulKernelType configure_G715_quantized(int k, bool act_enabled);
+    /** Default path, float types: always the plain native FP variant */
+    MatMulKernelType configure_default_float(int k, bool act_enabled);
+    /** Default path, quantized types: always the plain native quantized variant */
+    MatMulKernelType configure_default_quantized(int k, bool act_enabled);
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
new file mode 100644
index 0000000000..89cad30214
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include <limits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Return @p info0 when it validates for the described workload, otherwise the fallback @p info1.
+ *
+ * Synthetic LHS/RHS tensor infos are built from (m, n, k, b), honouring the adjoint flags, and
+ * @p info0 is checked with ClMatMulNativeKernel::validate(). When @p rhs_lock_padding is set the
+ * validation is skipped and @p info1 is returned.
+ */
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+                             const MatMulKernelInfo &info1,
+                             unsigned int            m,
+                             unsigned int            n,
+                             unsigned int            k,
+                             unsigned int            b,
+                             DataType                data_type,
+                             bool                    rhs_lock_padding)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image,
+                             "The fallback MatMul configuration cannot have export_to_cl_image = true");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+                             "The MatMul configurations must have the same adj_lhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+                             "The MatMul configurations must have the same adj_rhs value");
+
+    // An adjoint operand has its two innermost dimensions swapped
+    const TensorShape lhs_shape = info0.adj_lhs ? TensorShape(m, k, b) : TensorShape(k, m, b);
+    const TensorShape rhs_shape = info0.adj_rhs ? TensorShape(k, n, b) : TensorShape(n, k, b);
+
+    TensorInfo lhs_info(lhs_shape, 1, data_type);
+    TensorInfo rhs_info(rhs_shape, 1, data_type);
+    TensorInfo dst_info;
+
+    // The preferred configuration is only tried when the RHS padding is not locked
+    const bool try_preferred = !rhs_lock_padding;
+    if (try_preferred &&
+        bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+    {
+        return info0;
+    }
+
+    return info1;
+}
+
+/** Pick the tuned MatMul Native configuration whose workload is closest to the requested one.
+ *
+ * Every row of @p configs must hold 8 values: M, N, K, B, M0, N0, K0, IMG_RHS. The row whose
+ * (M, N, K, B) tuple has the smallest squared Euclidean distance to (m, n, k, b) is selected;
+ * on equal distance the earliest row wins.
+ *
+ * @param[in] configs Table of tuned configurations (must not be empty)
+ * @param[in] adj_lhs Adjoint LHS flag value, copied to the result
+ * @param[in] adj_rhs Adjoint RHS flag value, copied to the result
+ * @param[in] m       Requested M
+ * @param[in] n       Requested N
+ * @param[in] k       Requested K
+ * @param[in] b       Requested batch size
+ *
+ * @return @ref MatMulKernelInfo carrying adj_lhs, adj_rhs and the M0, N0, K0, IMG_RHS of the selected row
+ */
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool                             adj_lhs,
+                           bool                             adj_rhs,
+                           unsigned int                     m,
+                           unsigned int                     n,
+                           unsigned int                     k,
+                           unsigned int                     b)
+{
+    ARM_COMPUTE_ERROR_ON(configs.size() == 0);
+
+    // Squared difference of two extents; the operands are ordered first so the subtraction never wraps
+    const auto squared_diff = [](size_t lhs, size_t rhs)
+    {
+        const size_t diff = (lhs > rhs) ? (lhs - rhs) : (rhs - lhs);
+        return diff * diff;
+    };
+
+    size_t min_dist = std::numeric_limits<size_t>::max();
+    size_t min_idx  = 0;
+
+    // Find the nearest tuned workload over all four dimensions (M, N, K, B).
+    // The squared distance is compared directly: the square root is monotonic, and truncating it
+    // back to an integer made different distances compare equal, letting an earlier but farther
+    // row win the selection.
+    for (size_t y = 0; y < configs.size(); ++y)
+    {
+        const auto &row = configs[y];
+
+        // Every row is indexed up to [7], so each one is checked rather than only the first
+        ARM_COMPUTE_ERROR_ON_MSG(row.size() != 8U,
+                                 "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
+
+        size_t dist = 0;
+        dist += squared_diff(m, static_cast<size_t>(row[0]));
+        dist += squared_diff(n, static_cast<size_t>(row[1]));
+        dist += squared_diff(k, static_cast<size_t>(row[2]));
+        dist += squared_diff(b, static_cast<size_t>(row[3]));
+
+        if (dist < min_dist)
+        {
+            min_dist = dist;
+            min_idx  = y;
+        }
+    }
+
+    // Get the configuration from the nearest workload
+    const auto &best = configs[min_idx];
+
+    MatMulKernelInfo desc;
+    desc.adj_lhs                = adj_lhs;
+    desc.adj_rhs                = adj_rhs;
+    desc.m0                     = best[4];
+    desc.n0                     = best[5];
+    desc.k0                     = best[6];
+    desc.export_rhs_to_cl_image = best[7] != 0;
+
+    return desc;
+}
+} // namespace cl_matmul
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
new file mode 100644
index 0000000000..699f5fe8c1
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct MatMulKernelInfo;
+
+namespace cl_matmul
+{
+using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>; /**< Table of tuned configurations; find_info() expects 8 values per row: M, N, K, B, M0, N0, K0, IMG_RHS */
+
+/** This function accepts two MatMulKernelInfo objects where only the first can be with cl_image2d support enabled.
+ * The aim of this function is to check whether the first MatMulKernelInfo object is valid. If not, the function will
+ * return the second MatMulKernelInfo object. Otherwise, the first one.
+ *
+ * The check runs ClMatMulNativeKernel::validate() on LHS/RHS tensor infos built from (m, n, k, b) and the
+ * adjoint flags of @p info0. When @p rhs_lock_padding is true the check is skipped and @p info1 is returned.
+ * Both configurations must carry the same adj_lhs and adj_rhs values, and @p info1 must not have
+ * export_rhs_to_cl_image set; these preconditions are asserted.
+ *
+ * @param[in] info0 MatMulKernelInfo with cl_image2d support
+ * @param[in] info1 MatMulKernelInfo to fall-back if cl_image2d cannot be used
+ * @param[in] m Number of rows (M) of the LHS matrix
+ * @param[in] n Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b Batch size
+ * @param[in] data_type Data type
+ * @param[in] rhs_lock_padding Flag used to know whether the RHS paddings are locked
+ *
+ * @return @ref MatMulKernelInfo (a copy of either @p info0 or @p info1)
+ */
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+ const MatMulKernelInfo &info1,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type,
+ bool rhs_lock_padding);
+
+/** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user
+ *
+ * Each row of the matrix is expected to hold 8 values: M, N, K, B, M0, N0, K0, IMG_RHS. The row whose
+ * (M, N, K, B) values are nearest to the requested (m, n, k, b) supplies m0, n0, k0 and
+ * export_rhs_to_cl_image of the result; when several rows are equally near, the earliest one is used.
+ *
+ * @param[in] configs List of best configurations for a limited number of MatMul shapes (must not be empty)
+ * @param[in] adj_lhs Adjoint LHS flag value
+ * @param[in] adj_rhs Adjoint RHS flag value
+ * @param[in] m Number of rows (M) of the LHS matrix
+ * @param[in] n Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b Batch size
+ *
+ * @return @ref MatMulKernelInfo with adj_lhs and adj_rhs copied from the arguments
+ */
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+ bool adj_lhs,
+ bool adj_rhs,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b);
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..e7485bca81
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Factory building the MatMul Native kernel configuration heuristic for a GPU target */
+class ClMatMulNativeKernelConfigurationFactory final
+{
+public:
+    /** Create the kernel configuration heuristic that matches the architecture of the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelConfig
+     */
+    static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
+    {
+        const GPUTarget arch = get_arch_from_target(gpu);
+
+        const bool is_supported = (arch == GPUTarget::MIDGARD) || (arch == GPUTarget::BIFROST) ||
+                                  (arch == GPUTarget::VALHALL) || (arch == GPUTarget::FIFTHGEN);
+        if (!is_supported)
+        {
+            ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+
+        // Every supported architecture shares the Valhall default heuristic
+        return std::make_unique<ClMatMulNativeDefaultConfigValhall>(gpu);
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..c2895b8919
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+
+/** Factory building the MatMul Native kernel variant heuristic for a GPU target */
+class ClMatMulNativeKernelVariantFactory final
+{
+public:
+    /** Create the kernel variant heuristic that matches the architecture of the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelVariant
+     */
+    static std::unique_ptr<IClMatMulNativeKernelVariant> create(GPUTarget gpu)
+    {
+        std::unique_ptr<IClMatMulNativeKernelVariant> variant;
+
+        switch (get_arch_from_target(gpu))
+        {
+            // Every supported architecture shares the Valhall default heuristic
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                variant = std::make_unique<ClMatMulNativeDefaultVariantValhall>(gpu);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+
+        return variant;
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..00ba3641d5
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Basic container for the OpenCL MatMul Native configuration functions
+ *
+ * Stores one entry of type T per data-type group (F32, F16, 8-bit quantized) and returns the entry
+ * that corresponds to a given @ref DataType. T must be constructible from nullptr, which is what
+ * get_function() returns for an unsupported data type.
+ */
+template <class T>
+class ClMatMulNativeConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for matmul native F32
+     * @param[in] func_f16  Function to call for matmul native F16
+     * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+    {
+    }
+
+    /** Method to return the matmul native configuration function based on data type
+     *
+     * The lookup does not modify the container, so it is const-qualified and can also be used on
+     * const instances.
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the valid function otherwise it returns nullptr if the data type is not valid
+     */
+    T get_function(DataType data_type) const
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            // All 8-bit quantized types share a single entry
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs; /**< Entries indexed by DT_F32, DT_F16 and DT_INT8 */
+};
+
+/** Basic interface for the matmul native kernel configuration
+ * This is the base class that chooses architecture specific kernel configurations.
+ * Derived classes implement configure(); the GPU target given at construction is kept in _target.
+*/
+class IClMatMulNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target, stored in _target
+ */
+ IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);
+ /** Virtual destructor */
+ virtual ~IClMatMulNativeKernelConfig() = default;
+ /** This method returns the @ref MatMulKernelInfo for the given inputs
+ *
+ * @param[in] lhs LHS tensor
+ * @param[in] rhs RHS tensor
+ * @param[in] info MatMul info
+ *
+ * @return the @ref MatMulKernelInfo chosen by the implementing class
+ */
+ virtual MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) = 0;
+
+protected:
+ GPUTarget _target; /**< GPU target supplied at construction */
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..eac41dd6a3
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+
+#include "arm_compute/core/CoreTypes.h" // DataType
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+enum class MatMulKernelType
+{
+ /** Native matrix multiplication for FP types (the float path without the MMUL extension) */
+ NATIVE_FP,
+
+ /** Native matrix multiplication for quantized types (the quantized path without the MMUL extension) */
+ NATIVE_QUANTIZED,
+
+ /** Native matrix multiplication using MMUL extension for FP types.
+ * ClMatMulNativeDefaultVariantValhall::configure_G715_float() returns it only when the activation is
+ * disabled and K is a multiple of 4.
+ */
+ NATIVE_MMUL_FP,
+
+ /** Native matrix multiplication using MMUL extension for Quantized types.
+ * ClMatMulNativeDefaultVariantValhall::configure_G715_quantized() returns it only when the activation is
+ * disabled and K is a multiple of 16.
+ */
+ NATIVE_MMUL_QUANTIZED
+};
+
+/** Basic container for the OpenCL MatMul Native variant functions
+ *
+ * Stores one entry of type T per data-type group (float, quantized) and returns the entry that
+ * corresponds to a given @ref DataType. T must be constructible from nullptr, which is what
+ * get_function() returns for an unsupported data type.
+ */
+template <class T>
+class ClMatMulNativeVariantArray
+{
+public:
+    /** Alias for Float index */
+    static constexpr size_t DT_FLOAT = 0;
+    /** Alias for Quantized type index */
+    static constexpr size_t DT_QUANTIZED = 1;
+
+    /** Constructor
+     *
+     * @param[in] func_float     Function to call for matmul native float (F32, F16)
+     * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized}
+    {
+    }
+
+    /** Method to return the matmul native variant function based on data type
+     *
+     * The lookup does not modify the container, so it is const-qualified and can also be used on
+     * const instances.
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the valid function otherwise it returns nullptr if the data type is not valid
+     */
+    T get_function(DataType data_type) const
+    {
+        switch (data_type)
+        {
+            // Both floating-point types share a single entry
+            case DataType::F32:
+            case DataType::F16:
+                return _configs.at(DT_FLOAT);
+            // All 8-bit quantized types share a single entry
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_QUANTIZED);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 2> _configs; /**< Entries indexed by DT_FLOAT and DT_QUANTIZED */
+};
+
+/** Basic interface for the matmul native kernel variant
+ * This is the base class that chooses architecture specific kernel variants.
+ * Derived classes implement select_kernel(); the GPU target given at construction is kept in _target.
+*/
+class IClMatMulNativeKernelVariant
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target, stored in _target
+ */
+ IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant);
+ /** Virtual destructor */
+ virtual ~IClMatMulNativeKernelVariant() = default;
+ /** This method returns the @ref MatMulKernelType for the given inputs
+ *
+ * @param[in] lhs LHS tensor
+ * @param[in] rhs RHS tensor
+ * @param[in] info MatMul info
+ * @param[in] act_info Activation layer info
+ *
+ * @return the @ref MatMulKernelType chosen by the implementing class
+ */
+ virtual MatMulKernelType select_kernel(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulInfo &info,
+ const ActivationLayerInfo &act_info) = 0;
+
+protected:
+ GPUTarget _target; /**< GPU target supplied at construction */
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H